Path: blob/main/winch/codegen/src/isa/x64/masm.rs
use super::{
    RegAlloc,
    abi::X64ABI,
    address::Address,
    asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
    regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},
};
use crate::masm::{
    DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm as I, IntCmpKind,
    IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm,
    RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, ShiftKind, SplatKind,
    StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, V128AbsKind, V128AddKind,
    V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind, V128MinKind,
    V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, VectorCompareKind,
    VectorEqualityKind, Zero,
};
use crate::{
    Result,
    abi::{self, LocalSlot, align_to, calculate_frame_adjustment},
    bail,
    codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},
    format_err,
    stack::{TypedReg, Val},
};
use crate::{
    abi::{ABI, vmctx},
    masm::{SPOffset, StackSlot},
};
use crate::{
    isa::{
        CallingConvention,
        reg::{Reg, RegClass, WritableReg, writable},
    },
    masm::CalleeKind,
};
use cranelift_codegen::{
    Final, MachBufferFinalized, MachLabel,
    binemit::CodeOffset,
    ir::{MemFlags, RelSourceLoc, SourceLoc},
    isa::{
        unwind::UnwindInst,
        x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},
    },
    settings,
};
use wasmtime_cranelift::TRAP_UNREACHABLE;
use wasmtime_environ::{PtrSize, WasmValType};

// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
// need to fix up the bits that migrate from one half of the lane to the
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
// right by 0 (no movement), we want to retain all the bits so we mask with
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
// we mask with `0x7f`; etc.

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_ISHL_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
    0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
    0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
    0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
    0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
    0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
];

#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
const I8X16_USHR_MASKS: [u8; 128] = [
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
    0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
    0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
    0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
    0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
    0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
];
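
// Illustrative sketch of how a mask row is meant to be applied: an 8x16 left
// shift by one can be emitted as a 16x8 shift left by one followed by an AND
// with the second row above (all `0xfe` bytes). The wide shift moves each
// byte's MSB into the LSB of the byte above it, and the mask clears that
// migrated bit.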

/// x64 MacroAssembler.
pub(crate) struct MacroAssembler {
    /// Stack pointer offset.
    sp_offset: u32,
    /// This value represents the maximum stack size seen while compiling the function. While the
    /// function is still being compiled its value will not be valid (the stack will grow and
    /// shrink as space is reserved and freed during compilation), but once all instructions have
    /// been seen this value will be the maximum stack usage seen.
    sp_max: u32,
    /// Add instructions that are used to add the constant stack max to a register.
    stack_max_use_add: Option<PatchableAddToReg>,
    /// Low level assembler.
    asm: Assembler,
    /// ISA flags.
    flags: x64_settings::Flags,
    /// Shared flags.
    shared_flags: settings::Flags,
    /// The target pointer size.
    ptr_size: OperandSize,
    /// Scratch register scope.
    scratch_scope: RegAlloc,
}

impl Masm for MacroAssembler {
    type Address = Address;
    type Ptr = u8;
    type ABI = X64ABI;

    fn frame_setup(&mut self) -> Result<()> {
        let frame_pointer = rbp();
        let stack_pointer = rsp();

        self.asm.push_r(frame_pointer);

        if self.shared_flags.unwind_info() {
            self.asm.unwind_inst(UnwindInst::PushFrameRegs {
                offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
            })
        }

        self.asm
            .mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);

        Ok(())
    }
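
    // A sketch of the layout implied by the prologue above and the unwind
    // info registered below: `rbp` points at the saved caller `rbp`, the
    // return address sits directly above it, and the caller's stack pointer
    // (and therefore any stack arguments) is `Self::ABI::arg_base_offset()`
    // bytes above the current frame pointer.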

    fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
        let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();

        self.with_scratch::<IntScratch, _>(|masm, scratch| {
            masm.load_ptr(
                masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
                scratch.writable(),
            )?;

            masm.load_ptr(
                Address::offset(
                    scratch.inner(),
                    ptr_size.vmstore_context_stack_limit().into(),
                ),
                scratch.writable(),
            )?;

            masm.add_stack_max(scratch.inner());

            masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);
            masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
            wasmtime_environ::error::Ok(())
        })?;

        // Emit unwind info.
        if self.shared_flags.unwind_info() {
            self.asm.unwind_inst(UnwindInst::DefineNewFrame {
                offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),

                // The Winch calling convention has no callee-save registers, so nothing will be
                // clobbered.
                offset_downward_to_clobbers: 0,
            })
        }
        Ok(())
    }

    fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
        let bytes = match (reg.class(), size) {
            (RegClass::Int, OperandSize::S64) => {
                let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
                self.asm.push_r(reg);
                self.increment_sp(word_bytes);
                word_bytes
            }
            (RegClass::Int, OperandSize::S32) => {
                let bytes = size.bytes();
                self.reserve_stack(bytes)?;
                let sp_offset = SPOffset::from_u32(self.sp_offset);
                self.asm
                    .mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
                bytes
            }
            (RegClass::Float, _) => {
                let bytes = size.bytes();
                self.reserve_stack(bytes)?;
                let sp_offset = SPOffset::from_u32(self.sp_offset);
                self.asm
                    .xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
                bytes
            }
            _ => unreachable!(),
        };

        Ok(StackSlot {
            offset: SPOffset::from_u32(self.sp_offset),
            size: bytes,
        })
    }

    fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
        if bytes == 0 {
            return Ok(());
        }

        self.asm
            .sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
        self.increment_sp(bytes);

        Ok(())
    }

    fn free_stack(&mut self, bytes: u32) -> Result<()> {
        if bytes == 0 {
            return Ok(());
        }
        self.asm
            .add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
        self.decrement_sp(bytes);

        Ok(())
    }

    fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
        self.sp_offset = offset.as_u32();

        Ok(())
    }

    fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
        let (reg, offset) = if local.addressed_from_sp() {
            let offset = self
                .sp_offset
                .checked_sub(local.offset)
                .ok_or_else(|| CodeGenError::invalid_local_offset())?;
            (rsp(), offset)
        } else {
            (rbp(), local.offset)
        };

        Ok(Address::offset(reg, offset))
    }

    fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
        Ok(Address::offset(
            regs::rsp(),
            self.sp_offset - offset.as_u32(),
        ))
    }

    fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
        Ok(Address::offset(regs::rsp(), offset.as_u32()))
    }

    fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
        Ok(Address::offset(vmctx!(Self), offset))
    }

    fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
        self.store(src.into(), dst, self.ptr_size)
    }

    fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
        self.store_impl(src, dst, size, TRUSTED_FLAGS)
    }

    fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
        match kind {
            StoreKind::Operand(size) => {
                self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
            }
            StoreKind::Atomic(size) => {
                if size == OperandSize::S128 {
                    // TODO: we don't support 128-bit atomic store yet.
                    bail!(CodeGenError::unexpected_operand_size());
                }
                // To stay consistent with cranelift, we emit a normal store followed by an
                // mfence, although we could probably just emit an xchg.
                self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
                self.asm.mfence();
            }
            StoreKind::VectorLane(LaneSelector { lane, size }) => {
                self.ensure_has_avx()?;
                self.asm
                    .xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);
            }
        }

        Ok(())
    }

    fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
        let current_sp = SPOffset::from_u32(self.sp_offset);
        let _ = match (dst.to_reg().class(), size) {
            (RegClass::Int, OperandSize::S32) => {
                let addr = self.address_from_sp(current_sp)?;
                self.asm.movzx_mr(
                    &addr,
                    dst,
                    size.extend_to::<Zero>(OperandSize::S64),
                    TRUSTED_FLAGS,
                );
                self.free_stack(size.bytes())?;
            }
            (RegClass::Int, OperandSize::S64) => {
                self.asm.pop_r(dst);
                self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
            }
            (RegClass::Float, _) | (RegClass::Vector, _) => {
                let addr = self.address_from_sp(current_sp)?;
                self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
                self.free_stack(size.bytes())?;
            }
            _ => bail!(CodeGenError::invalid_operand_combination()),
        };
        Ok(())
    }

    fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {
        let r = self
            .scratch_scope
            .reg_for_class(T::reg_class(), &mut |_| Ok(()))
            .expect("Scratch register to be available");

        let ret = f(self, Scratch::new(r));
        self.scratch_scope.free(r);
        ret
    }

    fn call(
        &mut self,
        stack_args_size: u32,
        mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
    ) -> Result<u32> {
        let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
        let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
        let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
        let aligned_args_size = align_to(stack_args_size, alignment);
        let total_stack = delta + aligned_args_size;
        self.reserve_stack(total_stack)?;
        let (callee, cc) = load_callee(self)?;
        match callee {
            CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
            CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
        };
        Ok(total_stack)
    }

    fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
        self.load(src, dst, self.ptr_size)
    }

    fn compute_addr(
        &mut self,
        src: Self::Address,
        dst: WritableReg,
        size: OperandSize,
    ) -> Result<()> {
        self.asm.lea(&src, dst, size);
        Ok(())
    }

    fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
        self.load_impl(src, dst, size, TRUSTED_FLAGS)
    }

    fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
        let size = kind.derive_operand_size();

        match kind {
            LoadKind::ScalarExtend(ext) => match ext {
                ExtendKind::Signed(ext) => {
                    self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
                }
                ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
            },
            LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
                // The guarantees of the x86-64 memory model ensure that `SeqCst`
                // loads are equivalent to normal loads.
                if kind.is_atomic() && size == OperandSize::S128 {
                    bail!(CodeGenError::unexpected_operand_size());
                }

                self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
            }
            LoadKind::VectorExtend(ext) => {
                self.ensure_has_avx()?;
                self.asm
                    .xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
            }
            LoadKind::Splat(_) => {
                self.ensure_has_avx()?;

                if size == OperandSize::S64 {
                    self.asm
                        .xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
                    self.asm.xmm_vpshuf_rr(
                        dst.to_reg(),
                        dst,
                        Self::vpshuf_mask_for_64_bit_splats(),
                        OperandSize::S32,
                    );
                } else {
                    self.asm
                        .xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
                }
            }
            LoadKind::VectorLane(LaneSelector { lane, size }) => {
                self.ensure_has_avx()?;
                self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {
                    masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;
                    masm.asm
                        .xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);
                    wasmtime_environ::error::Ok(())
                })?;
            }
            LoadKind::VectorZero(size) => {
                self.ensure_has_avx()?;
                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;
                    masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);
                    wasmtime_environ::error::Ok(())
                })?;
            }
        }

        Ok(())
    }

    fn sp_offset(&self) -> Result<SPOffset> {
        Ok(SPOffset::from_u32(self.sp_offset))
    }

    fn zero(&mut self, reg: WritableReg) -> Result<()> {
        self.asm.xor_rr(
            reg.to_reg(),
            reg,
            OperandSize::from_bytes(<Self::ABI>::word_bytes()),
        );
        Ok(())
    }

    fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
        match (src, dst.to_reg()) {
            (RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
                (RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
                (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
                _ => bail!(CodeGenError::invalid_operand_combination()),
            },
            (RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),
        }
    }

    fn cmov(
        &mut self,
        dst: WritableReg,
        src: Reg,
        cc: IntCmpKind,
        size: OperandSize,
    ) -> Result<()> {
        match (src.class(), dst.to_reg().class()) {
            (RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
            (RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
            _ => Err(format_err!(CodeGenError::invalid_operand_combination())),
        }
    }

    fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), _) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.add_ir(v, dst, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.add_rr(scratch.inner(), dst, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), dst) => {
                self.asm.add_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn checked_uadd(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: RegImm,
        size: OperandSize,
        trap: TrapCode,
    ) -> Result<()> {
        self.add(dst, lhs, rhs, size)?;
        self.asm.trapif(CC::B, trap);
        Ok(())
    }

    fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), reg) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.sub_ir(v, reg, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.sub_rr(scratch.inner(), reg, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), dst) => {
                self.asm.sub_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), _) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.mul_ir(v, dst, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.mul_rr(scratch.inner(), dst, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), dst) => {
                self.asm.mul_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_add_rr(rhs, dst, size);
        Ok(())
    }

    fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_sub_rr(rhs, dst, size);
        Ok(())
    }

    fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_mul_rr(rhs, dst, size);
        Ok(())
    }

    fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_div_rr(rhs, dst, size);
        Ok(())
    }

    fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_min_seq(rhs, dst, size);
        Ok(())
    }

    fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm.xmm_max_seq(rhs, dst, size);
        Ok(())
    }

    fn float_copysign(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        size: OperandSize,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        let sign_mask = match size {
            OperandSize::S32 => I::I32(0x80000000),
            OperandSize::S64 => I::I64(0x8000000000000000),
            OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
                bail!(CodeGenError::unexpected_operand_size())
            }
        };

        self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
            masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
                masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;
                masm.asm
                    .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);

                // Clear everything except sign bit in src.
                masm.asm
                    .xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);

                // Clear sign bit in dst using scratch to store result. Then copy the
                // result back to dst.
                masm.asm
                    .xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);
                masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);

                // Copy sign bit from src to dst.
                masm.asm.xmm_or_rr(rhs, dst, size);
                Ok(())
            })
        })
    }
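
    // In other words, `float_copysign` above computes
    // `dst = (lhs & !sign_mask) | (rhs & sign_mask)`: the magnitude comes
    // from `lhs` (which aliases `dst`) and the sign bit comes from `rhs`.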

    fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
        debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
        let mask = match size {
            OperandSize::S32 => I::I32(0x80000000),
            OperandSize::S64 => I::I64(0x8000000000000000),
            OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
                bail!(CodeGenError::unexpected_operand_size())
            }
        };
        self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
            masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
                masm.load_constant(&mask, scratch_gpr.writable(), size)?;
                masm.asm
                    .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
                masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);
                Ok(())
            })
        })
    }

    fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
        debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
        let mask = match size {
            OperandSize::S32 => I::I32(0x7fffffff),
            OperandSize::S64 => I::I64(0x7fffffffffffffff),
            OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
                bail!(CodeGenError::unexpected_operand_size())
            }
        };

        self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
            masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
                masm.load_constant(&mask, scratch_gpr.writable(), size)?;

                masm.asm
                    .gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
                masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);
                Ok(())
            })
        })
    }

    fn float_round<
        F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
    >(
        &mut self,
        mode: RoundingMode,
        env: &mut FuncEnv<Self::Ptr>,
        context: &mut CodeGenContext<Emission>,
        size: OperandSize,
        mut fallback: F,
    ) -> Result<()> {
        if self.flags.has_sse41() {
            let src = context.pop_to_reg(self, None)?;
            self.asm
                .xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
            context.stack.push(src.into());
            Ok(())
        } else {
            fallback(env, context, self)
        }
    }

    fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
        self.asm.sqrt(src, dst, size);
        Ok(())
    }

    fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), _) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.and_ir(v, dst, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.and_rr(scratch.inner(), dst, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), dst) => {
                self.asm.and_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), _) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.or_ir(v, dst, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.or_rr(scratch.inner(), dst, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), dst) => {
                self.asm.or_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        match (rhs, dst) {
            (RegImm::Imm(imm), _) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.xor_ir(v, dst, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.xor_rr(scratch.inner(), dst, size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }

            (RegImm::Reg(src), _) => {
                self.asm.xor_rr(src, dst, size);
            }
        }

        Ok(())
    }

    fn shift_ir(
        &mut self,
        dst: WritableReg,
        imm: I,
        lhs: Reg,
        kind: ShiftKind,
        size: OperandSize,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
        self.asm
            .shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);
        Ok(())
    }

    fn shift(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: ShiftKind,
        size: OperandSize,
    ) -> Result<()> {
        // Number of bits to shift must be in the CL register.
        let src = context.pop_to_reg(self, Some(regs::rcx()))?;
        let dst = context.pop_to_reg(self, None)?;

        self.asm
            .shift_rr(src.into(), writable!(dst.into()), kind, size);

        context.free_reg(src);
        context.stack.push(dst.into());

        Ok(())
    }

    fn div(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: DivKind,
        size: OperandSize,
    ) -> Result<()> {
        // Allocate rdx:rax.
        let rdx = context.reg(regs::rdx(), self)?;
        let rax = context.reg(regs::rax(), self)?;

        // Allocate the divisor, which can be any gpr.
        let divisor = context.pop_to_reg(self, None)?;

        // Mark rax as allocatable.
        context.free_reg(rax);
        // Move the top value to rax.
        let rax = context.pop_to_reg(self, Some(rax))?;
        self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);

        // Free the divisor and rdx.
        context.free_reg(divisor);
        context.free_reg(rdx);

        // Push the quotient.
        context.stack.push(rax.into());
        Ok(())
    }

    fn rem(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: RemKind,
        size: OperandSize,
    ) -> Result<()> {
        // Allocate rdx:rax.
        let rdx = context.reg(regs::rdx(), self)?;
        let rax = context.reg(regs::rax(), self)?;

        // Allocate the divisor, which can be any gpr.
        let divisor = context.pop_to_reg(self, None)?;

        // Mark rax as allocatable.
        context.free_reg(rax);
        // Move the top value to rax.
        let rax = context.pop_to_reg(self, Some(rax))?;
        self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);

        // Free the divisor and rax.
        context.free_reg(divisor);
        context.free_reg(rax);

        // Push the remainder.
        context.stack.push(Val::reg(rdx, divisor.ty));

        Ok(())
    }

    fn frame_restore(&mut self) -> Result<()> {
        debug_assert_eq!(self.sp_offset, 0);
        self.asm.pop_r(writable!(rbp()));
        self.asm.ret();
        Ok(())
    }

    fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
        if let Some(patch) = self.stack_max_use_add {
            patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
        }

        Ok(self.asm.finalize(base))
    }

    fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
        Ok(Address::offset(reg, offset))
    }

    fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
        match src2 {
            RegImm::Imm(imm) => {
                if let Some(v) = imm.to_i32() {
                    self.asm.cmp_ir(src1, v, size);
                } else {
                    self.with_scratch::<IntScratch, _>(|masm, scratch| {
                        masm.load_constant(&imm, scratch.writable(), size)?;
                        masm.asm.cmp_rr(src1, scratch.inner(), size);
                        wasmtime_environ::error::Ok(())
                    })?;
                }
            }
            RegImm::Reg(src2) => {
                self.asm.cmp_rr(src1, src2, size);
            }
        }

        Ok(())
    }

    fn cmp_with_set(
        &mut self,
        dst: WritableReg,
        src: RegImm,
        kind: IntCmpKind,
        size: OperandSize,
    ) -> Result<()> {
        self.cmp(dst.to_reg(), src, size)?;
        self.asm.setcc(kind, dst);
        Ok(())
    }

    fn float_cmp_with_set(
        &mut self,
        dst: WritableReg,
        src1: Reg,
        src2: Reg,
        kind: FloatCmpKind,
        size: OperandSize,
    ) -> Result<()> {
        // Float comparisons need to be ordered (that is, comparing with a NaN
        // should return 0) except for not equal which needs to be unordered.
        // We use ucomis{s, d} because comis{s, d} has an undefined result if
        // either operand is NaN. Since ucomis{s, d} is unordered, we need to
        // compensate to make the comparison ordered. Ucomis{s, d} sets the
        // ZF, PF, and CF flags if there is an unordered result.
        let (src1, src2, set_kind) = match kind {
            FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
            FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
            FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
            FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
            // Reversing the operands and using the complementary comparison
            // avoids needing to perform an additional SETNP and AND
            // instruction.
            // SETNB and SETNBE check if the carry flag is unset (i.e., not
            // less than and not unordered) so we get the intended result
            // without having to look at the parity flag.
            FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
            FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
        };
        self.asm.ucomis(src1, src2, size);
        self.asm.setcc(set_kind, dst);
        let _ = match kind {
            FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
                // Return false if either operand is NaN by ensuring PF is
                // unset.
                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.setnp(scratch.writable());
                    masm.asm.and_rr(scratch.inner(), dst, size);
                });
            }
            FloatCmpKind::Ne => {
                // Return true if either operand is NaN by checking if PF is
                // set.
                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.setp(scratch.writable());
                    masm.asm.or_rr(scratch.inner(), dst, size);
                });
            }
            FloatCmpKind::Lt | FloatCmpKind::Le => (),
        };
        Ok(())
    }
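
    // Worked example for the ordered comparisons above: for `Eq` with a NaN
    // operand, `ucomis` reports unordered by setting ZF, PF and CF, so `sete`
    // writes 1, but `setnp` writes 0 and the final `and` produces 0, which is
    // the required result for an ordered equality involving NaN.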

    fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
        if self.flags.has_lzcnt() {
            self.asm.lzcnt(src, dst, size);
        } else {
            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                // Use the following approach:
                // dst = size.num_bits() - bsr(src) - is_not_zero
                //     = size.num_bits() + -bsr(src) - is_not_zero.
                masm.asm.bsr(src, dst, size);
                masm.asm.setcc(IntCmpKind::Ne, scratch.writable());
                masm.asm.neg(dst.to_reg(), dst, size);
                masm.asm.add_ir(size.num_bits() as i32, dst, size);
                masm.asm.sub_rr(scratch.inner(), dst, size);
            });
        }

        Ok(())
    }

    fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
        if self.flags.has_bmi1() {
            self.asm.tzcnt(src, dst, size);
        } else {
            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                // Use the following approach:
                // dst = bsf(src) + (is_zero * size.num_bits())
                //     = bsf(src) + (is_zero << size.log2()).
                // BSF outputs the correct value for every value except 0.
                // When the value is 0, BSF outputs 0, but the correct output
                // for ctz is the number of bits.
                masm.asm.bsf(src, dst, size);
                masm.asm.setcc(IntCmpKind::Eq, scratch.writable());
                masm.asm
                    .shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);
                masm.asm.add_rr(scratch.inner(), dst, size);
            });
        }

        Ok(())
    }
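
    // Worked example for the fallbacks above with a 32-bit source of 1 and no
    // LZCNT/TZCNT: `clz` computes `bsr(1) = 0` with `is_not_zero = 1`, so
    // `dst = 32 - 0 - 1 = 31`; `ctz` computes `bsf(1) = 0` with `is_zero = 0`,
    // so `dst = 0`, matching the dedicated instructions.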

    fn get_label(&mut self) -> Result<MachLabel> {
        let buffer = self.asm.buffer_mut();
        Ok(buffer.get_label())
    }

    fn bind(&mut self, label: MachLabel) -> Result<()> {
        let buffer = self.asm.buffer_mut();
        buffer.bind_label(label, &mut Default::default());
        Ok(())
    }

    fn branch(
        &mut self,
        kind: IntCmpKind,
        lhs: Reg,
        rhs: RegImm,
        taken: MachLabel,
        size: OperandSize,
    ) -> Result<()> {
        use IntCmpKind::*;

        match &(lhs, rhs) {
            (rlhs, RegImm::Reg(rrhs)) => {
                // If the comparison kind is zero or not zero and both operands
                // are the same register, emit a test instruction. Else we emit
                // a normal comparison.
                if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
                    self.asm.test_rr(*rlhs, *rrhs, size);
                } else {
                    self.cmp(lhs, rhs, size)?;
                }
            }
            _ => self.cmp(lhs, rhs, size)?,
        }
        self.asm.jmp_if(kind, taken);
        Ok(())
    }

    fn jmp(&mut self, target: MachLabel) -> Result<()> {
        self.asm.jmp(target);
        Ok(())
    }

    fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
        let src = context.pop_to_reg(self, None)?;
        if self.flags.has_popcnt() && self.flags.has_sse42() {
            self.asm.popcnt(src.into(), writable!(src.into()), size);
            context.stack.push(src.into());
            Ok(())
        } else {
            // The fallback functionality here is based on `MacroAssembler::popcnt64` in:
            // https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495

            let tmp = writable!(context.any_gpr(self)?);
            let dst = writable!(src.into());
            let (masks, shift_amt) = match size {
                OperandSize::S64 => (
                    [
                        0x5555555555555555, // m1
                        0x3333333333333333, // m2
                        0x0f0f0f0f0f0f0f0f, // m4
                        0x0101010101010101, // h01
                    ],
                    56u8,
                ),
                // 32-bit popcount is the same, except the masks are half as
                // wide and we shift by 24 at the end rather than 56
                OperandSize::S32 => (
                    [0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
                    24u8,
                ),
                _ => bail!(CodeGenError::unexpected_operand_size()),
            };
            self.asm.mov_rr(src.into(), tmp, size);

            // x -= (x >> 1) & m1;
            self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
            self.asm.sub_rr(dst.to_reg(), tmp, size);

            // x = (x & m2) + ((x >> 2) & m2);
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            // Load `0x3333...` into the scratch reg once, allowing us to use
            // `and_rr` and avoid inadvertently loading it twice as with `and`

            self.with_scratch::<IntScratch, _>(|masm, scratch| {
                masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;
                masm.asm.and_rr(scratch.inner(), dst, size);
                masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
                masm.asm.and_rr(scratch.inner(), tmp, size);
                wasmtime_environ::error::Ok(())
            })?;
            self.asm.add_rr(dst.to_reg(), tmp, size);

            // x = (x + (x >> 4)) & m4;
            self.asm.mov_rr(tmp.to_reg(), dst, size);
            self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);
            self.asm.add_rr(tmp.to_reg(), dst, size);
            let lhs = dst.to_reg();
            self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;

            // (x * h01) >> shift_amt
            let lhs = dst.to_reg();
            self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
            self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);

            context.stack.push(src.into());
            context.free_reg(tmp.to_reg());

            Ok(())
        }
    }
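
    // Worked example of the SWAR fallback above for the 32-bit value 0b1101:
    // after `x -= (x >> 1) & m1` the 2-bit groups hold 0b10_01 (two and one
    // set bits); after the `m2` step the low nibble holds 3; the `m4` step and
    // the final `* h01 >> 24` propagate that per-byte count into the low byte,
    // giving 3, the population count of 0b1101.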

    fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm.mov_rr(src, dst, OperandSize::S32);
        Ok(())
    }

    fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
        match kind {
            ExtendKind::Signed(ext) => {
                self.asm.movsx_rr(src, dst, ext);
            }
            ExtendKind::Unsigned(ext) => {
                self.asm.movzx_rr(src, dst, ext);
            }
        }

        Ok(())
    }

    fn signed_truncate(
        &mut self,
        dst: WritableReg,
        src: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
        kind: TruncKind,
    ) -> Result<()> {
        self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
            masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
                masm.asm.cvt_float_to_sint_seq(
                    src,
                    dst,
                    gpr_scratch.inner(),
                    xmm_scratch.inner(),
                    src_size,
                    dst_size,
                    kind.is_checked(),
                );
                Ok(())
            })
        })
    }

    fn unsigned_truncate(
        &mut self,
        ctx: &mut CodeGenContext<Emission>,
        src_size: OperandSize,
        dst_size: OperandSize,
        kind: TruncKind,
    ) -> Result<()> {
        let dst_ty = match dst_size {
            OperandSize::S32 => WasmValType::I32,
            OperandSize::S64 => WasmValType::I64,
            _ => bail!(CodeGenError::unexpected_operand_size()),
        };

        ctx.convert_op_with_tmp_reg(
            self,
            dst_ty,
            RegClass::Float,
            |masm, dst, src, tmp_fpr, dst_size| {
                masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
                    masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
                        masm.asm.cvt_float_to_uint_seq(
                            src,
                            writable!(dst),
                            gpr_scratch.inner(),
                            xmm_scratch.inner(),
                            tmp_fpr,
                            src_size,
                            dst_size,
                            kind.is_checked(),
                        );
                        Ok(())
                    })
                })
            },
        )
    }

    fn signed_convert(
        &mut self,
        dst: WritableReg,
        src: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) -> Result<()> {
        self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
        Ok(())
    }

    fn unsigned_convert(
        &mut self,
        dst: WritableReg,
        src: Reg,
        tmp_gpr: Reg,
        src_size: OperandSize,
        dst_size: OperandSize,
    ) -> Result<()> {
        // Need to zero-extend the unsigned 32-bit value to 64 bits for the
        // conversion instruction sequence.
        if let OperandSize::S32 = src_size {
            self.extend(
                writable!(src),
                src,
                ExtendKind::Unsigned(Extend::I64Extend32),
            )?;
        }

        self.with_scratch::<IntScratch, _>(|masm, scratch| {
            masm.asm
                .cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);
            Ok(())
        })
    }

    fn reinterpret_float_as_int(
        &mut self,
        dst: WritableReg,
        src: Reg,
        size: OperandSize,
    ) -> Result<()> {
        self.asm.xmm_to_gpr(src, dst, size);
        Ok(())
    }

    fn reinterpret_int_as_float(
        &mut self,
        dst: WritableReg,
        src: Reg,
        size: OperandSize,
    ) -> Result<()> {
        self.asm.gpr_to_xmm(src, dst, size);
        Ok(())
    }

    fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm
            .cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);
        Ok(())
    }

    fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
        self.asm
            .cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);
        Ok(())
    }

    fn unreachable(&mut self) -> Result<()> {
        self.asm.trap(TRAP_UNREACHABLE);
        Ok(())
    }

    fn trap(&mut self, code: TrapCode) -> Result<()> {
        self.asm.trap(code);
        Ok(())
    }

    fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
        self.asm.trapif(cc, code);
        Ok(())
    }

    fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
        self.asm.test_rr(src, src, self.ptr_size);
        self.asm.trapif(IntCmpKind::Eq, code);
        Ok(())
    }

    fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
        // At least one default target.
        debug_assert!(targets.len() >= 1);
        let default_index = targets.len() - 1;
        // Emit a bounds check by conditionally moving the max case index
        // into the given index reg if the contents of the index reg
        // are greater.
        let max = default_index;
        let size = OperandSize::S32;
        self.asm.mov_ir(max as u64, writable!(tmp), size);
        self.asm.cmp_rr(tmp, index, size);
        self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);

        let default = targets[default_index];
        let rest = &targets[0..default_index];

        self.with_scratch::<IntScratch, _>(|masm, tmp1| {
            masm.asm
                .jmp_table(rest.into(), default, index, tmp1.inner(), tmp);
            Ok(())
        })
    }

    fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
        Ok(self.asm.buffer_mut().start_srcloc(loc))
    }

    fn end_source_loc(&mut self) -> Result<()> {
        self.asm.buffer_mut().end_srcloc();
        Ok(())
    }

    fn current_code_offset(&self) -> Result<CodeOffset> {
        Ok(self.asm.buffer().cur_offset())
    }

    fn add128(
        &mut self,
        dst_lo: WritableReg,
        dst_hi: WritableReg,
        lhs_lo: Reg,
        lhs_hi: Reg,
        rhs_lo: Reg,
        rhs_hi: Reg,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
        self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
        self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
        Ok(())
    }

    fn sub128(
        &mut self,
        dst_lo: WritableReg,
        dst_hi: WritableReg,
        lhs_lo: Reg,
        lhs_hi: Reg,
        rhs_lo: Reg,
        rhs_hi: Reg,
    ) -> Result<()> {
        Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
        Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
        self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
        self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
        Ok(())
    }
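
    // `add128`/`sub128` above rely on the x64 flags chain: the low-half `add`
    // (resp. `sub`) sets the carry (borrow) flag and `adc` (resp. `sbb`) folds
    // it into the high half, yielding a full 128-bit operation.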

    fn mul_wide(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: MulWideKind,
    ) -> Result<()> {
        // Reserve rax/rdx since they're required by the `mul_wide` instruction
        // being used here.
        let rax = context.reg(regs::rax(), self)?;
        let rdx = context.reg(regs::rdx(), self)?;

        // The rhs of this binop can be in any register.
        let rhs = context.pop_to_reg(self, None)?;
        // Mark rax as allocatable, and then force the lhs operand to be placed
        // in `rax`.
        context.free_reg(rax);
        let lhs = context.pop_to_reg(self, Some(rax))?;

        self.asm.mul_wide(
            writable!(rax),
            writable!(rdx),
            lhs.reg,
            rhs.reg,
            kind,
            OperandSize::S64,
        );

        // No longer using the rhs register after the multiplication has been
        // executed.
        context.free_reg(rhs);

        // The low bits of the result are in rax, where `lhs` was allocated.
        context.stack.push(lhs.into());
        // The high bits of the result are in rdx, which we previously reserved.
        context.stack.push(Val::Reg(TypedReg::i64(rdx)));

        Ok(())
    }

    fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
        // Get the source and destination operands set up first.
        let (src, dst) = match size {
            // Floats can use the same register for `src` and `dst`.
            SplatKind::F32x4 | SplatKind::F64x2 => {
                let reg = context.pop_to_reg(self, None)?.reg;
                (RegImm::reg(reg), writable!(reg))
            }
            // For ints, we need to load the operand into a vector register if
            // it's not a constant.
            SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
                let dst = writable!(context.any_fpr(self)?);
                let src = if size == SplatKind::I64x2 {
                    context.pop_i64_const().map(RegImm::i64)
                } else {
                    context.pop_i32_const().map(RegImm::i32)
                }
                .map_or_else(
                    || -> Result<RegImm> {
                        let reg = context.pop_to_reg(self, None)?.reg;
                        self.reinterpret_int_as_float(
                            dst,
                            reg,
                            match size {
                                SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
                                    OperandSize::S32
                                }
                                SplatKind::I64x2 => OperandSize::S64,
                                SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
                            },
                        )?;
                        context.free_reg(reg);
                        Ok(RegImm::Reg(dst.to_reg()))
                    },
                    Ok,
                )?;
                (src, dst)
            }
        };

        // Perform the splat on the operands.
        if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
            self.ensure_has_avx()?;
            let mask = Self::vpshuf_mask_for_64_bit_splats();
            match src {
                RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
                }
            }
        } else {
            self.ensure_has_avx2()?;

            match src {
                RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
                RegImm::Imm(imm) => {
                    let src = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
                }
            }
        }

        context
            .stack
            .push(Val::reg(dst.to_reg(), WasmValType::V128));
        Ok(())
    }

    fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
        self.ensure_has_avx()?;

        // Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
        // separately to either the selected index or 0.
        // Then use `vpor` to combine `lhs` and `rhs` into `dst`.
        // Setting the most significant bit in the mask's lane to 1 will
        // result in corresponding lane in the destination register being
        // set to 0. 0x80 sets the most significant bit to 1.
        let mut mask_lhs: [u8; 16] = [0x80; 16];
        let mut mask_rhs: [u8; 16] = [0x80; 16];
        for i in 0..lanes.len() {
            if lanes[i] < 16 {
                mask_lhs[i] = lanes[i];
            } else {
                mask_rhs[i] = lanes[i] - 16;
            }
        }
        let mask_lhs = self.asm.add_constant(&mask_lhs);
        let mask_rhs = self.asm.add_constant(&mask_rhs);

        self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);
            masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());
            Ok(())
        })
    }
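
    // For example, with `lanes = [0, 16, 1, 17, ...]` the masks built above
    // start as `mask_lhs = [0, 0x80, 1, 0x80, ...]` and
    // `mask_rhs = [0x80, 0, 0x80, 1, ...]`, so the two `vpshufb`s pick
    // alternating bytes from `lhs` and `rhs` and the `vpor` merges them.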

    fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
        self.ensure_has_avx()?;

        // Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
        // outside that range.
        // Each lane is a signed byte so the maximum value is 0x7F. Adding
        // 0x70 to any value higher than 0xF will saturate resulting in a value
        // of 0xFF (i.e., 0).
        let clamp = self.asm.add_constant(&[0x70; 16]);
        self.asm
            .xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);

        // Don't need to subtract 0x70 since `vpshufb` uses the least
        // significant 4 bits which are the same after adding 0x70.
        self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
        Ok(())
    }
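
    // For example, an index byte of 0x03 becomes 0x73 after the saturating
    // add (bit 7 clear, low nibble unchanged), so `vpshufb` still selects
    // byte 3; an index of 0x10 becomes 0x80 and 0xFF saturates to 0xFF, both
    // with bit 7 set, so those lanes are zeroed as required.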

    fn atomic_rmw(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        addr: Self::Address,
        size: OperandSize,
        op: RmwOp,
        flags: MemFlags,
        extend: Option<Extend<Zero>>,
    ) -> Result<()> {
        let res = match op {
            RmwOp::Add => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm
                    .lock_xadd(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::Sub => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm.neg(operand.reg, writable!(operand.reg), size);
                self.asm
                    .lock_xadd(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::Xchg => {
                let operand = context.pop_to_reg(self, None)?;
                self.asm.xchg(addr, writable!(operand.reg), size, flags);
                operand.reg
            }
            RmwOp::And | RmwOp::Or | RmwOp::Xor => {
                let op = match op {
                    RmwOp::And => AtomicRmwSeqOp::And,
                    RmwOp::Or => AtomicRmwSeqOp::Or,
                    RmwOp::Xor => AtomicRmwSeqOp::Xor,
                    _ => unreachable!(
                        "invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
                    ),
                };
                let dst = context.reg(regs::rax(), self)?;
                let operand = context.pop_to_reg(self, None)?;

                self.with_scratch::<IntScratch, _>(|masm, scratch| {
                    masm.asm.atomic_rmw_seq(
                        addr,
                        operand.reg,
                        writable!(dst),
                        scratch.writable(),
                        size,
                        flags,
                        op,
                    );
                });

                context.free_reg(operand.reg);
                dst
            }
        };

        let dst_ty = match extend {
            Some(ext) => {
                // We don't need to zero-extend from 32 to 64 bits.
                if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
                    self.asm.movzx_rr(res, writable!(res), ext);
                }

                WasmValType::int_from_bits(ext.to_bits())
            }
            None => WasmValType::int_from_bits(size.num_bits()),
        };

        context.stack.push(TypedReg::new(dst_ty, res).into());

        Ok(())
    }

    fn extract_lane(
        &mut self,
        src: Reg,
        dst: WritableReg,
        lane: u8,
        kind: ExtractLaneKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            ExtractLaneKind::I8x16S
            | ExtractLaneKind::I8x16U
            | ExtractLaneKind::I16x8S
            | ExtractLaneKind::I16x8U
            | ExtractLaneKind::I32x4
            | ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
            ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
                // If the `src` and `dst` registers are the same, then the
                // appropriate value is already in the correct position in
                // the register.
                assert!(src == dst.to_reg());
            }
            ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
            ExtractLaneKind::F64x2 => {
                // `0b11_10` selects the high and low 32-bits of the second
                // 64-bit, so `0b11_10_11_10` splats the 64-bit value across
                // both lanes. Since we put an `f64` on the stack, we use
                // the splatted value.
                // Double-check `lane == 0` was handled in another branch.
                assert!(lane == 1);
                self.asm
                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
            }
        }

        // Sign-extend to 32-bits for sign extended kinds.
        match kind {
            ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
                self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
            }
            _ => (),
        }

        Ok(())
    }

    fn replace_lane(
        &mut self,
        src: RegImm,
        dst: WritableReg,
        lane: u8,
        kind: ReplaceLaneKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            ReplaceLaneKind::I8x16
            | ReplaceLaneKind::I16x8
            | ReplaceLaneKind::I32x4
            | ReplaceLaneKind::I64x2 => match src {
                RegImm::Reg(reg) => {
                    self.asm
                        .xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
                }
                RegImm::Imm(imm) => {
                    let address = self.asm.add_constant(&imm.to_bytes());
                    self.asm
                        .xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
                }
            },
            ReplaceLaneKind::F32x4 => {
                // The immediate for `vinsertps` uses its low 4 bits to determine
                // which elements of the destination to set to 0. The next 2
                // bits specify which element of the destination will be
                // overwritten.
                let imm = lane << 4;
                match src {
                    RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
                    RegImm::Imm(val) => {
                        let address = self.asm.add_constant(&val.to_bytes());
                        self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
                    }
                }
            }
            ReplaceLaneKind::F64x2 => match src {
                RegImm::Reg(reg) => match lane {
                    0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
                    1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
                    _ => unreachable!(),
                },
                RegImm::Imm(imm) => {
                    let address = self.asm.add_constant(&imm.to_bytes());
                    match lane {
                        0 => {
                            // Memory load variant of `vmovsd` zeroes the upper
                            // 64 bits of the register so need to load the
                            // immediate to a register to use the register
                            // variant of `vmovsd` to perform the merge.

                            self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                                masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);
                                masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());
                            });
                        }
                        1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
                        _ => unreachable!(),
                    }
                }
            },
        }
        Ok(())
    }

    fn atomic_cas(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        addr: Self::Address,
        size: OperandSize,
        flags: MemFlags,
        extend: Option<Extend<Zero>>,
    ) -> Result<()> {
        // `cmpxchg` expects `expected` to be in the `*a*` register.
        // Reserve rax for the expected argument.
        let rax = context.reg(regs::rax(), self)?;

        let replacement = context.pop_to_reg(self, None)?;

        // Mark `rax` as allocatable again.
        context.free_reg(rax);
        let expected = context.pop_to_reg(self, Some(regs::rax()))?;

        self.asm
            .cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags);

        if let Some(extend) = extend {
            // We don't need to zero-extend from 32 to 64 bits.
            if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
                self.asm
                    .movzx_rr(expected.reg, writable!(expected.reg), extend);
            }
        }

        context.stack.push(expected.into());
        context.free_reg(replacement);

        Ok(())
    }

    fn v128_eq(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorEqualityKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorEqualityKind::I8x16
            | VectorEqualityKind::I16x8
            | VectorEqualityKind::I32x4
            | VectorEqualityKind::I64x2 => {
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
            }
            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
            }
        }
        Ok(())
    }

    fn v128_ne(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorEqualityKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorEqualityKind::I8x16
            | VectorEqualityKind::I16x8
            | VectorEqualityKind::I32x4
            | VectorEqualityKind::I64x2 => {
                // Check for equality and invert the results.
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
            }
        }
        Ok(())
    }

    fn v128_lt(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S
            | VectorCompareKind::I16x8S
            | VectorCompareKind::I32x4S
            | VectorCompareKind::I64x2S => {
                // Perform a greater than check with reversed parameters.
                self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set `lhs` to min values, check for equality, then invert the
                // result.
                // If `lhs` is smaller, then equality check will fail and result
                // will be inverted to true. Otherwise the equality check will
                // pass and be inverted to false.
                self.asm
                    .xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
            }
        }
        Ok(())
    }

    fn v128_le(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
                // Set the `rhs` vector to the signed minimum values and then
                // compare them with `lhs` for equality.
                self.asm
                    .xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::I64x2S => {
                // Do a greater than check and invert the results.
                self.asm
                    .xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set the `rhs` vector to the unsigned minimum values and then
                // compare them with `lhs` for equality.
                self.asm
                    .xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                self.asm
                    .xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
            }
        }
        Ok(())
    }

    fn v128_gt(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S
            | VectorCompareKind::I16x8S
            | VectorCompareKind::I32x4S
            | VectorCompareKind::I64x2S => {
                self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set `lhs` to max values, check for equality, then invert the
                // result.
                // If `lhs` is larger, then equality check will fail and result
                // will be inverted to true. Otherwise the equality check will
                // pass and be inverted to false.
                self.asm
                    .xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
                self.asm
                    .xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                // Do a less than comparison with the operands swapped.
                self.asm
                    .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
            }
        }
        Ok(())
    }

    fn v128_ge(
        &mut self,
        dst: WritableReg,
        lhs: Reg,
        rhs: Reg,
        kind: VectorCompareKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
                // Set each lane to maximum value and then compare for equality.
                self.asm
                    .xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::I64x2S => {
                // Perform a greater than comparison with operands swapped,
                // then invert the results.
                self.asm
                    .xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
                self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
            }
            VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
                // Set lanes to maximum values and compare them for equality.
                self.asm
                    .xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
                self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
            }
            VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
                // Perform a less than or equal comparison on swapped operands.
                self.asm
                    .xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
            }
        }

        Ok(())
    }

    fn fence(&mut self) -> Result<()> {
        self.asm.mfence();
        Ok(())
    }

    fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;

        self.with_scratch::<FloatScratch, _>(|masm, tmp| {
            // First, we initialize `tmp` with all ones by comparing it with
            // itself.
            masm.asm
                .xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);
            // Then we `xor` tmp and `dst` together, yielding `!dst`.
            masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);
            Ok(())
        })
    }

    fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpand_rrr(src1, src2, dst);
        Ok(())
    }

    fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpandn_rrr(src1, src2, dst);
        Ok(())
    }

    fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpor_rrr(dst, src1, src2);
        Ok(())
    }

    fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vpxor_rrr(src1, src2, dst);
        Ok(())
    }

    fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;

        self.with_scratch::<FloatScratch, _>(|masm, tmp| {
            masm.v128_and(src1, mask, tmp.writable())?;
            masm.v128_and_not(mask, src2, dst)?;
            masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;
            Ok(())
        })
    }
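
    // `v128_bitselect` above computes `dst = (src1 & mask) | (src2 & !mask)`:
    // bit positions set in `mask` are taken from `src1`, the rest from `src2`.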

    fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vptest(src, src);
        self.asm.setcc(IntCmpKind::Ne, dst);
        Ok(())
    }

    fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
            V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
            V128ConvertKind::I32x4U => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Split each 32-bit integer into 16-bit parts.
                    // `scratch` will contain the low bits and `dst` will contain
                    // the high bits.
                    masm.asm
                        .xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());
                    masm.asm.xmm_vpsrl_rri(
                        scratch.inner(),
                        scratch.writable(),
                        0x10,
                        kind.src_lane_size(),
                    );
                    masm.asm
                        .xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());

                    // Convert the low bits in `scratch` to floating point numbers.
                    masm.asm
                        .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);

                    // Prevent overflow by right shifting high bits.
                    masm.asm
                        .xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());
                    // Convert high bits in `dst` to floating point numbers.
                    masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
                    // Double high bits in `dst` to reverse right shift.
                    masm.asm
                        .xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
                    // Add high bits in `dst` to low bits in `scratch`.
                    masm.asm.xmm_vaddp_rrr(
                        dst.to_reg(),
                        scratch.inner(),
                        dst,
                        kind.src_lane_size(),
                    );
                });
            }
            V128ConvertKind::I32x4LowU => {
                // See
                // https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
                // for details on the Cranelift AVX implementation.
                // Use `vunpcklp` to create doubles from the integers.
                // Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers
                // creates a byte array for a double that sets the mantissa
                // bits to the original integer value.
                let conversion_constant = self
                    .asm
                    .add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
                self.asm
                    .xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
                // Subtract the 0x1.0p52 added above.
                let conversion_constant = self.asm.add_constant(&[
                    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
                    0x00, 0x30, 0x43,
                ]);
                self.asm.xmm_vsub_rrm(
                    dst.to_reg(),
                    &conversion_constant,
                    dst,
                    kind.dst_lane_size(),
                );
            }
        }
        Ok(())
    }

    fn v128_narrow(
        &mut self,
        src1: Reg,
        src2: Reg,
        dst: WritableReg,
        kind: V128NarrowKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
                self.asm
                    .xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
            }
            V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
                self.asm
                    .xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
            }
        }
        Ok(())
    }

    fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
        Ok(())
    }

    fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
        self.ensure_has_avx()?;
        self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
        Ok(())
    }

    fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128ExtendKind::LowI8x16S
            | V128ExtendKind::LowI8x16U
            | V128ExtendKind::LowI16x8S
            | V128ExtendKind::LowI16x8U
            | V128ExtendKind::LowI32x4S
            | V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
            V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
                self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
            }
            V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    masm.asm
                        .xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
                    masm.asm
                        .xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());
                });
            }
            V128ExtendKind::HighI32x4S => {
                // Move the 3rd element (i.e., 0b10) to the 1st (rightmost)
                // position and the 4th element (i.e., 0b11) to the 2nd (second
                // from the right) position and then perform the extend.
                self.asm
                    .xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
                self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
            }
            V128ExtendKind::HighI32x4U => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Set `scratch` to a vector of 0s.
                    masm.asm.xmm_vxorp_rrr(
                        scratch.inner(),
                        scratch.inner(),
                        scratch.writable(),
                        kind.src_lane_size(),
                    );
                    // Interleave the 0 bits into the two 32-bit integers to zero extend them.
                    masm.asm
                        .xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());
                });
            }
        }
        Ok(())
    }

    fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),
            V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),
            V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),
            V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),
            V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),
            V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),
            V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),
            V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),
            V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),
            V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),
        };
        Ok(())
    }

    fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
        self.ensure_has_avx()?;
        match kind {
            V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),
            V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),
            V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),
            V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),
            V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),
            V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),
            V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),
            V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),
            V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),
            V128SubKind::I64x2 =>
self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
        };
        Ok(())
    }

    fn v128_mul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128MulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let rhs = context.pop_to_reg(self, None)?;
        let lhs = context.pop_to_reg(self, None)?;

        let mul_i64x2_avx512 = |this: &mut Self| {
            this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
        };

        let mul_i64x2_fallback = |this: &mut Self,
                                  context: &mut CodeGenContext<Emission>|
         -> Result<()> {
            // Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we
            // have to fall back to an instruction sequence using 32-bit multiplications
            // (taken from the Cranelift implementation, in `isa/x64/lower.isle`):
            //
            // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
            // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
            // > multiplication can then be written as:
            //
            // >    Ah Al
            // > *  Bh Bl
            // >    -----
            // >    Al * Bl
            // >  + (Ah * Bl) << 32
            // >  + (Al * Bh) << 32
            //
            // > So for each lane we will compute:
            //
            // >    A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
            //
            // > Note, the algorithm will use `pmuludq` which operates directly on the lower
            // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
            // > the lane of the destination. For this reason we don't need shifts to isolate
            // > the lower 32-bits, however, we will need to use shifts to isolate the high
            // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.

            let tmp2 = context.any_fpr(this)?;
            this.with_scratch::<FloatScratch, _>(|this, tmp1| {
                // tmp1 = lhs_hi = (lhs >> 32)
                this.asm
                    .xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
                this.asm
                    .xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));

                // tmp1 = rhs_hi = rhs >> 32
                this.asm
                    .xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp1 = lhs_low * rhs_high = tmp1 * lhs
                this.asm
                    .xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());

                // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);

                // tmp1 = tmp1 << 32
                this.asm
                    .xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_lo * rhs_lo
                this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));

                // Finally, with `lhs` as destination:
                // lhs = (lhs_lo * rhs_lo) + (((lhs_hi * rhs_lo) + (lhs_lo * rhs_hi)) << 32) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
            });

            context.free_reg(tmp2);

            Ok(())
        };

        match kind {
            V128MulKind::F32x4 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            V128MulKind::F64x2 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
            }
            V128MulKind::I16x8 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
            }
            V128MulKind::I32x4 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            // This is the fast path when AVX512 is available.
            V128MulKind::I64x2
                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
            {
                mul_i64x2_avx512(self)
            }
            //
Otherwise, we emit AVX fallback sequence.2282V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,2283}22842285context.stack.push(lhs.into());2286context.free_reg(rhs);22872288Ok(())2289}22902291fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {2292self.ensure_has_avx()?;22932294match kind {2295V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {2296self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())2297}2298V128AbsKind::I64x2 => {2299self.with_scratch::<FloatScratch, _>(|masm, scratch| {2300// Perform an arithmetic right shift of 31 bits. If the number2301// is positive, this will result in all zeroes in the upper2302// 32-bits. If the number is negative, this will result in all2303// ones in the upper 32-bits.2304masm.asm2305.xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);2306// Copy the ones and zeroes in the high bits of each 64-bit2307// lane to the low bits of each 64-bit lane.2308masm.asm.xmm_vpshuf_rr(2309scratch.inner(),2310scratch.writable(),23110b11_11_01_01,2312OperandSize::S32,2313);2314// Flip the bits in lanes that were negative in `src` and leave2315// the positive lanes as they are. Positive lanes will have a2316// zero mask in `scratch` so xor doesn't affect them.2317masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);2318// Subtract the mask from the results of xor which will2319// complete the two's complement for lanes which were negative.2320masm.asm2321.xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2322});2323}2324V128AbsKind::F32x4 | V128AbsKind::F64x2 => {2325self.with_scratch::<FloatScratch, _>(|masm, scratch| {2326// Create a mask of all ones.2327masm.asm.xmm_vpcmpeq_rrr(2328scratch.writable(),2329scratch.inner(),2330scratch.inner(),2331kind.lane_size(),2332);2333// Right shift the mask so each lane is a single zero followed2334// by all ones.2335masm.asm.xmm_vpsrl_rri(2336scratch.inner(),2337scratch.writable(),23380x1,2339kind.lane_size(),2340);2341// Use the mask to zero the sign bit in each lane which will2342// make the float value positive.2343masm.asm2344.xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());2345});2346}2347}2348Ok(())2349}23502351fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {2352self.ensure_has_avx()?;23532354match kind {2355V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {2356self.with_scratch::<FloatScratch, _>(|masm, tmp| {2357masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;2358masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;2359wasmtime_environ::error::Ok(())2360})?;2361}2362V128NegKind::F32x4 | V128NegKind::F64x2 => {2363self.with_scratch::<FloatScratch, _>(|masm, tmp| {2364// Create a mask of all 1s.2365masm.asm.xmm_vpcmpeq_rrr(2366tmp.writable(),2367tmp.inner(),2368tmp.inner(),2369kind.lane_size(),2370);2371// Left shift the lanes in the mask so only the sign bit in the2372// mask is set to 1.2373masm.asm.xmm_vpsll_rri(2374tmp.inner(),2375tmp.writable(),2376(kind.lane_size().num_bits() - 1) as u32,2377kind.lane_size(),2378);2379// Use the mask to flip the sign bit.2380masm.asm2381.xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());2382});2383}2384}2385Ok(())2386}23872388fn v128_shift(2389&mut self,2390context: &mut CodeGenContext<Emission>,2391lane_width: OperandSize,2392kind: ShiftKind,2393) -> Result<()> {2394self.ensure_has_avx()?;2395let shift_amount = context.pop_to_reg(self, None)?.reg;2396let operand = context.pop_to_reg(self, 
None)?.reg;2397let amount_mask = lane_width.num_bits() - 1;23982399self.and(2400writable!(shift_amount),2401shift_amount,2402RegImm::i32(amount_mask as i32),2403OperandSize::S32,2404)?;24052406let move_to_tmp_xmm = |this: &mut Self, tmp_xmm: Scratch| {2407this.asm2408.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);2409};24102411// A helper for deciding between `vpsllw` and `vpsrlw` in2412// `shift_i8x16`.2413enum Direction {2414Left,2415Right,2416}24172418let shift_i8x16 = |this: &mut Self,2419masks: &'static [u8],2420direction: Direction|2421-> Result<()> {2422// The case for i8x16 is a little bit trickier because x64 doesn't provide a 8bit2423// shift instruction. Instead, we shift as 16bits, and then mask the bits in the2424// 8bits lane, for example (with 2 8bits lanes):2425// - Before shifting:2426// 01001101 111011102427// - shifting by 2 left:2428// 00110111 101110002429// ^^_ these bits come from the previous byte, and need to be masked.2430// - The mask:2431// 11111100 111111112432// - After masking:2433// 00110100 101110002434//2435// The mask is loaded from a well known memory, depending on the shift amount.24362437this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {2438this.asm2439.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);24402441// Perform the 16-bit shift.2442match direction {2443Direction::Left => this.asm.xmm_vpsll_rrr(2444operand,2445tmp_xmm.inner(),2446writable!(operand),2447OperandSize::S16,2448),2449Direction::Right => this.asm.xmm_vpsrl_rrr(2450operand,2451tmp_xmm.inner(),2452writable!(operand),2453OperandSize::S16,2454),2455}24562457// Get a handle to the masks array constant.2458let masks_addr = this.asm.add_constant(masks);24592460this.with_scratch::<IntScratch, _>(|this, tmp| {2461// Load the masks array effective address into the tmp register.2462this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);24632464// Compute the offset of the mask that we need to use. This is shift_amount * 16 ==2465// shift_amount << 4.2466this.asm2467.shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);24682469// Load the mask to tmp_xmm.2470this.asm.xmm_vmovdqu_mr(2471&Address::ImmRegRegShift {2472simm32: 0,2473base: tmp.inner(),2474index: shift_amount,2475shift: 0,2476},2477tmp_xmm.writable(),2478MemFlags::trusted(),2479);2480});24812482// Mask unwanted bits from operand.2483this.asm2484.xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));2485Ok(())2486})2487};24882489let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {2490const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;24912492// AVX doesn't have an instruction for i64x2 signed right shift. 
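            // (Aside: an illustrative scalar sketch, not code that this
            // function emits and not an existing helper; the name is made up.
            // It shows, on a single u64 lane, the identity the vector
            // sequence below relies on.)
            #[allow(dead_code)]
            fn sar_via_logical_shifts_sketch(x: u64, n: u32) -> u64 {
                // `t` is the sign-bit mask, already shifted into place.
                let t = (1u64 << 63) >> n;
                // Logical shift, then flip and subtract the mask to propagate
                // the sign bit through the vacated upper bits.
                ((x >> n) ^ t).wrapping_sub(t)
            }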
Instead we use the2493// following formula (from hacker's delight 2-7), where x is the value and n the shift2494// amount, for each lane:2495// t = (1 << 63) >> n; ((x >> n) ^ t) - t24962497// We need an extra scratch register:2498let tmp_xmm2 = context.any_fpr(this)?;24992500this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {2501this.asm2502.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);25032504let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());25052506this.asm2507.xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());2508this.asm.xmm_vpsrl_rrr(2509tmp_xmm2,2510tmp_xmm.inner(),2511writable!(tmp_xmm2),2512OperandSize::S64,2513);2514this.asm.xmm_vpsrl_rrr(2515operand,2516tmp_xmm.inner(),2517writable!(operand),2518OperandSize::S64,2519);2520});2521this.asm2522.xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));2523this.asm2524.xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);25252526context.free_reg(tmp_xmm2);25272528Ok(())2529};25302531let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {2532// Since the x86 instruction set does not have an 8x16 shift instruction and the2533// approach used for `ishl` and `ushr` cannot be easily used (the masks do not2534// preserve the sign), we use a different approach here: separate the low and2535// high lanes, shift them separately, and merge them into the final result.2536//2537// Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,2538// s15]:2539//2540// lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]2541// shifted_lo.i16x8 = shift each lane of `low`2542// hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]2543// shifted_hi.i16x8 = shift each lane of `high`2544// result = [s0'', s1'', ..., s15'']25452546// In order for `packsswb` later to only use the high byte of each2547// 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to2548// fill in the upper bits appropriately.2549let tmp_lo = context.any_fpr(this)?;2550let tmp_hi = context.any_fpr(this)?;25512552this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {2553this.asm2554.add_ir(8, writable!(shift_amount), OperandSize::S32);2555this.asm2556.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);25572558// Extract lower and upper bytes.2559this.asm2560.xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);2561this.asm2562.xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);25632564// Perform 16bit right shift of upper and lower bytes.2565this.asm.xmm_vpsra_rrr(2566tmp_lo,2567tmp_xmm.inner(),2568writable!(tmp_lo),2569OperandSize::S16,2570);2571this.asm.xmm_vpsra_rrr(2572tmp_hi,2573tmp_xmm.inner(),2574writable!(tmp_hi),2575OperandSize::S16,2576);2577});25782579// Merge lower and upper bytes back.2580this.asm2581.xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);25822583context.free_reg(tmp_lo);2584context.free_reg(tmp_hi);25852586Ok(())2587};25882589match (lane_width, kind) {2590// shl2591(OperandSize::S8, ShiftKind::Shl) => {2592shift_i8x16(self, &I8X16_ISHL_MASKS, Direction::Left)?2593}2594(OperandSize::S16, ShiftKind::Shl) => {2595self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2596move_to_tmp_xmm(masm, tmp_xmm);2597masm.asm.xmm_vpsll_rrr(2598operand,2599tmp_xmm.inner(),2600writable!(operand),2601OperandSize::S16,2602);2603})2604}2605(OperandSize::S32, ShiftKind::Shl) => {2606self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2607move_to_tmp_xmm(masm, 
tmp_xmm);2608masm.asm.xmm_vpsll_rrr(2609operand,2610tmp_xmm.inner(),2611writable!(operand),2612OperandSize::S32,2613);2614})2615}2616(OperandSize::S64, ShiftKind::Shl) => {2617self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2618move_to_tmp_xmm(masm, tmp_xmm);2619masm.asm.xmm_vpsll_rrr(2620operand,2621tmp_xmm.inner(),2622writable!(operand),2623OperandSize::S64,2624);2625})2626}2627// shr_u2628(OperandSize::S8, ShiftKind::ShrU) => {2629shift_i8x16(self, &I8X16_USHR_MASKS, Direction::Right)?2630}2631(OperandSize::S16, ShiftKind::ShrU) => {2632self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2633move_to_tmp_xmm(masm, tmp_xmm);2634masm.asm.xmm_vpsrl_rrr(2635operand,2636tmp_xmm.inner(),2637writable!(operand),2638OperandSize::S16,2639);2640})2641}2642(OperandSize::S32, ShiftKind::ShrU) => {2643self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2644move_to_tmp_xmm(masm, tmp_xmm);2645masm.asm.xmm_vpsrl_rrr(2646operand,2647tmp_xmm.inner(),2648writable!(operand),2649OperandSize::S32,2650);2651})2652}2653(OperandSize::S64, ShiftKind::ShrU) => {2654self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2655move_to_tmp_xmm(masm, tmp_xmm);2656masm.asm.xmm_vpsrl_rrr(2657operand,2658tmp_xmm.inner(),2659writable!(operand),2660OperandSize::S64,2661);2662})2663}2664// shr_s2665(OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,2666(OperandSize::S16, ShiftKind::ShrS) => {2667self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2668move_to_tmp_xmm(masm, tmp_xmm);2669masm.asm.xmm_vpsra_rrr(2670operand,2671tmp_xmm.inner(),2672writable!(operand),2673OperandSize::S16,2674);2675})2676}2677(OperandSize::S32, ShiftKind::ShrS) => {2678self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2679move_to_tmp_xmm(masm, tmp_xmm);2680masm.asm.xmm_vpsra_rrr(2681operand,2682tmp_xmm.inner(),2683writable!(operand),2684OperandSize::S32,2685);2686})2687}2688(OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,26892690_ => bail!(CodeGenError::invalid_operand_combination()),2691}26922693context.free_reg(shift_amount);2694context2695.stack2696.push(TypedReg::new(WasmValType::V128, operand).into());2697Ok(())2698}26992700fn v128_q15mulr_sat_s(2701&mut self,2702lhs: Reg,2703rhs: Reg,2704dst: WritableReg,2705size: OperandSize,2706) -> Result<()> {2707self.ensure_has_avx()?;27082709self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);27102711// Need to handle edge case of multiplying -1 by -1 (0x8000 in Q152712// format) because of how `vpmulhrs` handles rounding. 
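        // (Aside: a scalar sketch of the intended Q15 semantics, not emitted
        // code; the helper name is hypothetical. It shows why
        // i16::MIN * i16::MIN is the one input that needs the fix-up below.)
        #[allow(dead_code)]
        fn q15mulr_sat_s_sketch(a: i16, b: i16) -> i16 {
            // Multiply in Q15, round to nearest, then saturate to i16.
            let p = (i32::from(a) * i32::from(b) + (1 << 14)) >> 15;
            p.clamp(i32::from(i16::MIN), i32::from(i16::MAX)) as i16
        }
        // q15mulr_sat_s_sketch(i16::MIN, i16::MIN) is 0x7FFF, whereas the raw
        // `vpmulhrs` result for that input is 0x8000.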
`vpmulhrs`2713// produces 0x8000 in that case when the correct result is 0x7FFF (that2714// is, +1) so need to check if the result is 0x8000 and flip the bits2715// of the result if it is.2716let address = self.asm.add_constant(&[27170x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,27180x00, 0x80,2719]);2720self.asm2721.xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);2722self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);2723Ok(())2724}27252726fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2727self.ensure_has_avx()?;27282729self.with_scratch::<FloatScratch, _>(|masm, scratch| {2730// Create a mask of all 0s.2731masm.asm2732.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());2733// Sets lane in `dst` to not zero if `src` lane was zero, and lane in2734// `dst` to zero if `src` lane was not zero.2735masm.asm2736.xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);2737// Sets ZF if all values are zero (i.e., if all original values were not zero).2738masm.asm.xmm_vptest(src, src);2739// Set byte if ZF=1.2740});2741self.asm.setcc(IntCmpKind::Eq, dst);2742Ok(())2743}27442745fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2746self.ensure_has_avx()?;27472748match size {2749OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),2750OperandSize::S16 => {2751// Signed conversion of 16-bit integers to 8-bit integers.2752self.asm2753.xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);2754// Creates a mask from each byte in `src`.2755self.asm2756.xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);2757// Removes 8 bits added as a result of the `vpackss` step.2758self.asm2759.shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);2760}2761OperandSize::S32 | OperandSize::S64 => {2762self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)2763}2764_ => unimplemented!(),2765}27662767Ok(())2768}27692770fn v128_trunc(2771&mut self,2772context: &mut CodeGenContext<Emission>,2773kind: V128TruncKind,2774) -> Result<()> {2775self.ensure_has_avx()?;27762777let reg = writable!(context.pop_to_reg(self, None)?.reg);2778match kind {2779V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(2780reg.to_reg(),2781reg,2782VroundMode::TowardZero,2783kind.dst_lane_size(),2784),2785V128TruncKind::I32x4FromF32x4S => {2786self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;2787}2788V128TruncKind::I32x4FromF32x4U => {2789let temp_reg = writable!(context.any_fpr(self)?);2790self.v128_trunc_sat_f32x4_u(2791reg,2792temp_reg,2793kind.src_lane_size(),2794kind.dst_lane_size(),2795)?;2796context.free_reg(temp_reg.to_reg());2797}2798V128TruncKind::I32x4FromF64x2SZero => {2799self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;2800}2801V128TruncKind::I32x4FromF64x2UZero => {2802self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;2803}2804}28052806context.stack.push(TypedReg::v128(reg.to_reg()).into());2807Ok(())2808}28092810fn v128_min(2811&mut self,2812src1: Reg,2813src2: Reg,2814dst: WritableReg,2815kind: V128MinKind,2816) -> Result<()> {2817self.ensure_has_avx()?;28182819match kind {2820V128MinKind::I8x16S2821| V128MinKind::I8x16U2822| V128MinKind::I16x8S2823| V128MinKind::I16x8U2824| V128MinKind::I32x4S2825| V128MinKind::I32x4U => {2826match kind {2827V128MinKind::I8x16S => {2828self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)2829}2830V128MinKind::I8x16U 
=> {2831self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)2832}2833V128MinKind::I16x8S => {2834self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)2835}2836V128MinKind::I16x8U => {2837self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)2838}2839V128MinKind::I32x4S => {2840self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)2841}2842V128MinKind::I32x4U => {2843self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)2844}2845_ => unreachable!(),2846};2847}2848V128MinKind::F32x4 | V128MinKind::F64x2 => {2849self.with_scratch::<FloatScratch, _>(|masm, scratch| {2850// Handling +0 and -0 as well as NaN values are not commutative2851// when using `vminp` so we have to compensate.2852// Perform two comparison operations with the operands swapped2853// and OR the result to propagate 0 (positive and negative) and2854// NaN.2855masm.asm2856.xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());2857masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());2858// Use a single OR instruction to set the sign bit if either2859// result has the sign bit set to correctly propagate -0.2860masm.asm2861.xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2862});2863// Set lanes with NaN to all 1s.2864self.asm.xmm_vcmpp_rrr(2865writable!(src2),2866src2,2867dst.to_reg(),2868kind.lane_size(),2869VcmpKind::Unord,2870);2871// Doesn't change non-NaN values. For NaN values, sets all bits.2872self.asm2873.xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());2874self.canonicalize_nans(writable!(src2), dst, kind.lane_size());2875}2876}28772878Ok(())2879}28802881fn v128_max(2882&mut self,2883src1: Reg,2884src2: Reg,2885dst: WritableReg,2886kind: V128MaxKind,2887) -> Result<()> {2888self.ensure_has_avx()?;28892890match kind {2891V128MaxKind::I8x16S2892| V128MaxKind::I8x16U2893| V128MaxKind::I16x8S2894| V128MaxKind::I16x8U2895| V128MaxKind::I32x4S2896| V128MaxKind::I32x4U => {2897match kind {2898V128MaxKind::I8x16S => {2899self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)2900}2901V128MaxKind::I8x16U => {2902self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)2903}2904V128MaxKind::I16x8S => {2905self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)2906}2907V128MaxKind::I16x8U => {2908self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)2909}2910V128MaxKind::I32x4S => {2911self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)2912}2913V128MaxKind::I32x4U => {2914self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)2915}2916_ => unreachable!(),2917};2918}2919V128MaxKind::F32x4 | V128MaxKind::F64x2 => {2920self.with_scratch::<FloatScratch, _>(|masm, scratch| {2921// Handling +0 and -0 as well as NaN values are not commutative2922// when using `vmaxp` so we have to compensate.2923// Perform two comparison operations with the operands swapped2924// so we can propagate 0 (positive and negative) and NaNs2925// correctly.29262927masm.asm2928.xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());2929masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());2930// This combination of XOR, OR, and SUB will set the sign bit2931// on a 0 result to the correct value for a max operation.2932masm.asm2933.xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2934masm.asm.xmm_vorp_rrr(2935dst.to_reg(),2936scratch.inner(),2937writable!(src2),2938kind.lane_size(),2939);2940});2941self.asm2942.xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());2943// Set lanes of NaN values to 
1.2944self.asm.xmm_vcmpp_rrr(2945writable!(src2),2946src2,2947src2,2948kind.lane_size(),2949VcmpKind::Unord,2950);2951self.canonicalize_nans(writable!(src2), dst, kind.lane_size());2952}2953}2954Ok(())2955}29562957fn v128_extmul(2958&mut self,2959context: &mut CodeGenContext<Emission>,2960kind: V128ExtMulKind,2961) -> Result<()> {2962self.ensure_has_avx()?;29632964// The implementation for extmul is not optimized; for simplicity's sake, we simply perform2965// an extension followed by a multiplication using already implemented primitives.29662967let src1 = context.pop_to_reg(self, None)?;2968let src2 = context.pop_to_reg(self, None)?;29692970let ext_kind = kind.into();2971self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;2972self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;29732974context.stack.push(src2.into());2975context.stack.push(src1.into());29762977self.v128_mul(context, kind.into())2978}29792980fn v128_extadd_pairwise(2981&mut self,2982src: Reg,2983dst: WritableReg,2984kind: V128ExtAddKind,2985) -> Result<()> {2986self.ensure_has_avx()?;29872988match kind {2989V128ExtAddKind::I8x16S => {2990self.with_scratch::<FloatScratch, _>(|masm, scratch| {2991// Use `vpmaddubsw` with a vector of 16 8-bit 1's which will2992// sign extend `src` to 16 bits and add adjacent words.2993// Need to supply constant as first operand since first operand2994// is treated as unsigned and the second operand is signed.2995let mask = masm.asm.add_constant(&[1; 16]);2996masm.asm.xmm_mov_mr(2997&mask,2998scratch.writable(),2999OperandSize::S128,3000MemFlags::trusted(),3001);3002masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);3003});3004}3005V128ExtAddKind::I8x16U => {3006// Same approach as the signed variant but treat `src` as3007// unsigned instead of signed by passing it as the first3008// operand.3009let mask = self.asm.add_constant(&[1; 16]);3010self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);3011}3012V128ExtAddKind::I16x8S => {3013// Similar approach to the two variants above. The vector is 83014// lanes of 16-bit 1's and `vpmaddwd` treats both operands as3015// signed.3016let mask = self3017.asm3018.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);3019self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);3020}3021V128ExtAddKind::I16x8U => {3022// Similar approach as the signed variant.3023// `vpmaddwd` operates on signed integers and the operand is3024// unsigned so the operand needs to be converted to a signed3025// format and than that process needs to be reversed after3026// `vpmaddwd`.3027// Flip the sign bit for 8 16-bit lanes.3028let xor_mask = self.asm.add_constant(&[30290x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,30300x80, 0x00, 0x80,3031]);3032self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);30333034let madd_mask = self3035.asm3036.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);3037self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);30383039// Reverse the XOR. 
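                // (Aside: an illustrative scalar model of this unsigned
                // trick, not emitted code; the helper name is made up.
                // Flipping the sign bit turns each unsigned u16 lane `x` into
                // the signed value `x - 0x8000`, so after the signed pairwise
                // add, adding back 2 * 0x8000 = 0x10000 restores the unsigned
                // sum.)
                #[allow(dead_code)]
                fn extadd_pairwise_u16_sketch(a: u16, b: u16) -> u32 {
                    let a_biased = (a ^ 0x8000) as i16 as i32; // a - 0x8000
                    let b_biased = (b ^ 0x8000) as i16 as i32; // b - 0x8000
                    (a_biased + b_biased + 0x1_0000) as u32 // == a + b
                }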
The XOR effectively subtracts 32,768 from3040// both pairs that are added together so 65,536 (0x10000)3041// needs to be added to 4 lanes of 32-bit values.3042let add_mask = self3043.asm3044.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);3045self.asm3046.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);3047}3048}3049Ok(())3050}30513052fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {3053self.ensure_has_avx()?;3054self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);3055Ok(())3056}30573058fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {3059self.ensure_has_avx()?;30603061let reg = writable!(context.pop_to_reg(self, None)?.reg);3062let reg2 = writable!(context.any_fpr(self)?);30633064// This works by using a lookup table to determine the count of bits3065// set in the upper 4 bits and lower 4 bits separately and then adding3066// the counts.30673068// A mask to zero out the upper 4 bits in each lane.3069let address = self.asm.add_constant(&[30700x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,30710x0F, 0x0F,3072]);30733074self.with_scratch::<FloatScratch, _>(|masm, scratch| {3075// Zero out the upper 4 bits of each lane.3076masm.asm3077.xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());3078// Right shift bytes in input by 4 bits to put the upper 4 bits in the3079// lower 4 bits.3080masm.asm3081.xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);3082// Zero out the upper 4 bits of each shifted lane.3083masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);30843085// Write a lookup table of 4 bit values to number of bits set to a3086// register so we only perform the memory read once.3087// Index (hex) | Value (binary) | Population Count3088// 0x0 | 0000 | 03089// 0x1 | 0001 | 13090// 0x2 | 0010 | 13091// 0x3 | 0011 | 23092// 0x4 | 0100 | 13093// 0x5 | 0101 | 23094// 0x6 | 0110 | 23095// 0x7 | 0111 | 33096// 0x8 | 1000 | 13097// 0x9 | 1001 | 23098// 0xA | 1010 | 23099// 0xB | 1011 | 33100// 0xC | 1100 | 23101// 0xD | 1101 | 33102// 0xE | 1110 | 33103// 0xF | 1111 | 43104let address = masm.asm.add_constant(&[31050x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,3106]);3107masm.asm3108.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());3109// Use the upper 4 bits as an index into the lookup table.3110masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());3111// Use the lower 4 bits as an index into the lookup table.3112masm.asm3113.xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());3114context.free_reg(reg2.to_reg());31153116// Add the counts of the upper 4 bits and the lower 4 bits to get the3117// total number of bits set.3118masm.asm3119.xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);3120wasmtime_environ::error::Ok(())3121})?;31223123context.stack.push(TypedReg::v128(reg.to_reg()).into());3124Ok(())3125}31263127fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3128self.ensure_has_avx()?;3129self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);3130Ok(())3131}31323133fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3134self.ensure_has_avx()?;3135self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);3136Ok(())3137}31383139fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3140self.ensure_has_avx()?;3141self.asm.xmm_vsqrtp_rr(src, dst, size);3142Ok(())3143}31443145fn v128_ceil(&mut 
self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3146self.ensure_has_avx()?;3147self.asm3148.xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);3149Ok(())3150}31513152fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3153self.ensure_has_avx()?;3154self.asm3155.xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);3156Ok(())3157}31583159fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3160self.ensure_has_avx()?;3161self.asm3162.xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);3163Ok(())3164}31653166fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3167self.ensure_has_avx()?;3168// Reverse operands since Wasm specifies returning the first operand if3169// either operand is NaN while x86 returns the second operand.3170self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);3171Ok(())3172}31733174fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3175self.ensure_has_avx()?;3176// Reverse operands since Wasm specifies returning the first operand if3177// either operand is NaN while x86 returns the second operand.3178self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);3179Ok(())3180}3181}31823183impl MacroAssembler {3184/// Create an x64 MacroAssembler.3185pub fn new(3186ptr_size: impl PtrSize,3187shared_flags: settings::Flags,3188isa_flags: x64_settings::Flags,3189) -> Result<Self> {3190let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());31913192Ok(Self {3193sp_offset: 0,3194sp_max: 0,3195stack_max_use_add: None,3196asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),3197flags: isa_flags,3198shared_flags,3199ptr_size: ptr_type.try_into()?,3200scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),3201})3202}32033204/// Add the maximum stack used to a register, recording an obligation to update the3205/// add-with-immediate instruction emitted to use the real stack max when the masm is being3206/// finalized.3207fn add_stack_max(&mut self, reg: Reg) {3208assert!(self.stack_max_use_add.is_none());3209let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);3210self.stack_max_use_add.replace(patch);3211}32123213fn ensure_has_avx(&self) -> Result<()> {3214crate::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);3215Ok(())3216}32173218fn ensure_has_avx2(&self) -> Result<()> {3219crate::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);3220Ok(())3221}32223223fn ensure_has_avx512vl(&self) -> Result<()> {3224crate::ensure!(3225self.flags.has_avx512vl(),3226CodeGenError::UnimplementedForNoAvx512VL3227);3228Ok(())3229}32303231fn ensure_has_avx512dq(&self) -> Result<()> {3232crate::ensure!(3233self.flags.has_avx512dq(),3234CodeGenError::UnimplementedForNoAvx512DQ3235);3236Ok(())3237}32383239fn increment_sp(&mut self, bytes: u32) {3240self.sp_offset += bytes;32413242// NOTE: we use `max` here to track the largest stack allocation in `sp_max`. 
Once we have3243// seen the entire function, this value will represent the maximum size for the stack3244// frame.3245self.sp_max = self.sp_max.max(self.sp_offset);3246}32473248fn decrement_sp(&mut self, bytes: u32) {3249assert!(3250self.sp_offset >= bytes,3251"sp offset = {}; bytes = {}",3252self.sp_offset,3253bytes3254);3255self.sp_offset -= bytes;3256}32573258fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {3259match constant {3260I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),3261I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),3262I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3263I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3264I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3265}3266}32673268/// A common implementation for zero-extend stack loads.3269fn load_impl(3270&mut self,3271src: Address,3272dst: WritableReg,3273size: OperandSize,3274flags: MemFlags,3275) -> Result<()> {3276if dst.to_reg().is_int() {3277let ext = size.extend_to::<Zero>(OperandSize::S64);3278self.asm.movzx_mr(&src, dst, ext, flags);3279} else {3280self.asm.xmm_mov_mr(&src, dst, size, flags);3281}32823283Ok(())3284}32853286/// A common implementation for stack stores.3287fn store_impl(3288&mut self,3289src: RegImm,3290dst: Address,3291size: OperandSize,3292flags: MemFlags,3293) -> Result<()> {3294let _ = match src {3295RegImm::Imm(imm) => match imm {3296I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),3297I::I64(v) => match v.try_into() {3298Ok(v) => self.asm.mov_im(v, &dst, size, flags),3299Err(_) => {3300// If the immediate doesn't sign extend, use a scratch3301// register.3302self.with_scratch::<IntScratch, _>(|masm, scratch| {3303masm.asm.mov_ir(v, scratch.writable(), size);3304masm.asm.mov_rm(scratch.inner(), &dst, size, flags);3305});3306}3307},3308I::F32(v) => {3309let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3310self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3311// Always trusted, since we are loading the constant from3312// the constant pool.3313masm.asm.xmm_mov_mr(3314&addr,3315float_scratch.writable(),3316size,3317MemFlags::trusted(),3318);3319masm.asm3320.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3321});3322}3323I::F64(v) => {3324let addr = self.asm.add_constant(v.to_le_bytes().as_slice());33253326self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3327// Similar to above, always trusted since we are loading the3328// constant from the constant pool.3329masm.asm.xmm_mov_mr(3330&addr,3331float_scratch.writable(),3332size,3333MemFlags::trusted(),3334);3335masm.asm3336.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3337});3338}3339I::V128(v) => {3340let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3341self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {3342// Always trusted, since we are loading the constant from3343// the constant pool.3344masm.asm.xmm_mov_mr(3345&addr,3346vector_scratch.writable(),3347size,3348MemFlags::trusted(),3349);3350masm.asm3351.xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);3352});3353}3354},3355RegImm::Reg(reg) => {3356if reg.is_int() {3357self.asm.mov_rm(reg, &dst, size, flags);3358} else {3359self.asm.xmm_mov_rm(reg, &dst, size, flags);3360}3361}3362};3363Ok(())3364}33653366fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {3367if dst != lhs {3368Err(format_err!(CodeGenError::invalid_two_arg_form()))3369} else 
{
            Ok(())
        }
    }

    /// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
    fn vpshuf_mask_for_64_bit_splats() -> u8 {
        // Results in the first 4 bytes and second 4 bytes being
        // swapped and then the swapped bytes being copied.
        // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
        // [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
        0b01_00_01_00
    }

    fn v128_trunc_sat_f32x4_s(
        &mut self,
        reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Create a mask to handle NaN values (1 for not NaN, 0 for
            // NaN).
            masm.asm.xmm_vcmpp_rrr(
                scratch.writable(),
                reg.to_reg(),
                reg.to_reg(),
                src_lane_size,
                VcmpKind::Eq,
            );
            // Zero out any NaN values.
            masm.asm
                .xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a mask for the sign bits.
            masm.asm
                .xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
            // Convert floats to integers.
            masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
            // Apply sign mask to the converted integers.
            masm.asm
                .xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
            // Create a saturation mask of all 1s for negative numbers,
            // all 0s for positive numbers. The arithmetic shift will copy
            // the sign bit.
            masm.asm
                .xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
            // Combine converted integers with saturation mask.
            masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
            Ok(())
        })
    }

    fn v128_trunc_sat_f32x4_u(
        &mut self,
        reg: WritableReg,
        temp_reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Set scratch to all zeros.
            masm.asm.xmm_vxorp_rrr(
                reg.to_reg(),
                reg.to_reg(),
                scratch.writable(),
                src_lane_size,
            );
            // Clamp negative numbers to 0.
            masm.asm
                .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a vector of all 1s.
            masm.asm.xmm_vpcmpeq_rrr(
                scratch.writable(),
                scratch.inner(),
                scratch.inner(),
                src_lane_size,
            );
            // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
            // performing a logical shift right.
            masm.asm
                .xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
            // Convert max signed int to float as a reference point for saturation.
            masm.asm
                .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
            // Convert the floats to integers and put the results in `temp_reg`.
            // This is signed and not unsigned so we need to handle the
            // value for the high bit in each lane.
            masm.asm
                .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
            // Set `reg` lanes to the amount that the value in the lane
            // exceeds the maximum signed 32-bit integer.
            masm.asm
                .xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
            // Create mask in `scratch` for numbers that are larger than
            // the maximum signed 32-bit integer.
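            // (Aside: a scalar sketch of the saturating behavior this whole
            // sequence is emulating, not emitted code; the helper name is
            // hypothetical. AVX has no packed unsigned conversion, hence the
            // signed conversion plus the "excess over i32::MAX" correction
            // performed here.)
            #[allow(dead_code)]
            fn trunc_sat_f32_to_u32_sketch(x: f32) -> u32 {
                if x.is_nan() || x <= 0.0 {
                    0
                } else if x >= 4_294_967_296.0 {
                    u32::MAX
                } else {
                    x as u32
                }
            }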
Lanes that don't fit3464// in 32-bits ints will be 1.3465masm.asm.xmm_vcmpp_rrr(3466scratch.writable(),3467scratch.inner(),3468reg.to_reg(),3469dst_lane_size,3470VcmpKind::Le,3471);3472// Convert the excess over signed 32-bits from floats to integers.3473masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);3474// Apply large number mask to excess values which will flip the3475// bits in any lanes that exceed signed 32-bits. Adding this3476// flipped value to the signed value will set the high bit and3477// the carry behavior will update the other bits correctly.3478masm.asm3479.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());3480// Set `reg` to all 0s.3481masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);3482// Ensure excess values are not negative by taking max b/w3483// excess values and zero.3484masm.asm3485.xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);3486});3487// Perform the addition between the signed conversion value (in3488// `reg2`) and the flipped excess value (in `reg`) to get the3489// unsigned value.3490self.asm3491.xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);3492Ok(())3493}34943495fn v128_trunc_sat_f64x2_s_zero(3496&mut self,3497reg: WritableReg,3498src_lane_size: OperandSize,3499) -> Result<()> {3500self.with_scratch::<FloatScratch, _>(|masm, scratch| {3501// Create a NaN mask (1s for non-NaN, 0s for NaN).3502masm.asm.xmm_vcmpp_rrr(3503scratch.writable(),3504reg.to_reg(),3505reg.to_reg(),3506src_lane_size,3507VcmpKind::Eq,3508);3509// Clamp NaN values to maximum 64-bit float that can be3510// converted to an i32.3511let address = masm.asm.add_constant(&[35120x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,35130xDF, 0x41,3514]);3515masm.asm3516.xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);3517// Handle the saturation for values too large to fit in an i32.3518masm.asm3519.xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3520// Convert the floats to integers.3521masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);35223523Ok(())3524})3525}35263527fn v128_trunc_sat_f64x2_u_zero(3528&mut self,3529reg: WritableReg,3530src_lane_size: OperandSize,3531dst_lane_size: OperandSize,3532) -> Result<()> {3533self.with_scratch::<FloatScratch, _>(|masm, scratch| {3534// Zero out the scratch register.3535masm.asm.xmm_vxorp_rrr(3536scratch.inner(),3537scratch.inner(),3538scratch.writable(),3539src_lane_size,3540);3541// Clamp negative values to zero.3542masm.asm3543.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3544// Clamp value to maximum unsigned 32-bit integer value3545// (0x41F0000000000000).3546let address = masm.asm.add_constant(&[35470x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,35480xEF, 0x41,3549]);3550masm.asm3551.xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);3552// Truncate floating point values.3553masm.asm3554.xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);3555// Add 2^52 (doubles store 52 bits in their mantissa) to each3556// lane causing values in the lower bits to be shifted into3557// position for integer conversion.3558let address = masm.asm.add_constant(&[35590x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,35600x30, 0x43,3561]);3562masm.asm3563.xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);3564// Takes lanes 0 and 2 from `reg` (converted values) and lanes3565// 0 and 2 from 
`scratch` (zeroes) to put the converted ints in
            // the lower lanes and zeroes in the upper lanes.
            masm.asm.xmm_vshufp_rrri(
                reg.to_reg(),
                scratch.inner(),
                reg,
                0b10_00_10_00,
                dst_lane_size,
            );
            Ok(())
        })
    }

    /// Given a vector of floats where lanes with NaN values are set to all 1s
    /// in `mask` and a vector register `dst` with a mix of non-NaN values and
    /// possibly non-canonical NaN values, this canonicalizes any NaNs in `dst`.
    fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
        // Canonical NaNs do not preserve the sign bit, have the exponent bits
        // all set, and have only the high bit of the mantissa set so shift by
        // that number.
        // The mask we're producing in this step will be inverted in the next
        // step.
        let amount_to_shift = 1 + size.mantissa_bits() + 1;
        self.asm
            .xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
        // The mask will be inverted by the ANDN so non-NaN values will be all
        // 1s and NaN values will set the sign bit, exponent bits, and zero out
        // almost all of the mantissa.
        self.asm
            .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
    }
}
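
// (Aside: an illustrative, self-contained sketch of the 2^52 trick used by
// `v128_trunc_sat_f64x2_u_zero` above, not part of this module's API; the
// module and test names are made up. For an already-truncated double `x` with
// 0 <= x < 2^32, adding 2^52 leaves the integer value of `x` in the low 32
// bits of the double's bit pattern, which is what the final shuffle picks
// out.)
#[cfg(test)]
mod two_pow_52_trick_sketch {
    #[test]
    fn low_bits_hold_the_integer() {
        for x in [0.0f64, 1.0, 42.0, 4_294_967_295.0] {
            let bits = (x + 4_503_599_627_370_496.0).to_bits(); // 2^52
            assert_eq!(bits as u32, x as u32);
        }
    }
}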