Path: blob/main/winch/codegen/src/isa/x64/masm.rs
1693 views
use super::{1RegAlloc,2abi::X64ABI,3address::Address,4asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},5regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},6};7use anyhow::{Result, anyhow, bail};89use crate::masm::{10DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm as I, IntCmpKind,11IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm,12RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, ShiftKind, SplatKind,13StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, V128AbsKind, V128AddKind,14V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind, V128MinKind,15V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, VectorCompareKind,16VectorEqualityKind, Zero,17};18use crate::{19abi::{self, LocalSlot, align_to, calculate_frame_adjustment},20codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},21stack::{TypedReg, Val},22};23use crate::{24abi::{ABI, vmctx},25masm::{SPOffset, StackSlot},26};27use crate::{28isa::{29CallingConvention,30reg::{Reg, RegClass, WritableReg, writable},31},32masm::CalleeKind,33};34use cranelift_codegen::{35Final, MachBufferFinalized, MachLabel,36binemit::CodeOffset,37ir::{MemFlags, RelSourceLoc, SourceLoc},38isa::{39unwind::UnwindInst,40x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},41},42settings,43};44use wasmtime_cranelift::TRAP_UNREACHABLE;45use wasmtime_environ::{PtrSize, WasmValType};4647// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`48// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we49// need to fix up the bits that migrate from one half of the lane to the50// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift51// right by 0 (no movement), we want to retain all the bits so we mask with52// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so53// we mask with `0x7f`; etc.5455#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.56const I8X16_ISHL_MASKS: [u8; 128] = [570xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,580xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,590xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,600xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,610xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,620xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,630xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,640x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,65];6667#[rustfmt::skip] // Preserve 16 bytes (i.e. 
one mask) per row.68const I8X16_USHR_MASKS: [u8; 128] = [690xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,700x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,710x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,720x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,730x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,740x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,750x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,760x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,77];7879/// x64 MacroAssembler.80pub(crate) struct MacroAssembler {81/// Stack pointer offset.82sp_offset: u32,83/// This value represents the maximum stack size seen while compiling the function. While the84/// function is still being compiled its value will not be valid (the stack will grow and85/// shrink as space is reserved and freed during compilation), but once all instructions have86/// been seen this value will be the maximum stack usage seen.87sp_max: u32,88/// Add instructions that are used to add the constant stack max to a register.89stack_max_use_add: Option<PatchableAddToReg>,90/// Low level assembler.91asm: Assembler,92/// ISA flags.93flags: x64_settings::Flags,94/// Shared flags.vmcontext_store_context95shared_flags: settings::Flags,96/// The target pointer size.97ptr_size: OperandSize,98/// Scratch register scope.99scratch_scope: RegAlloc,100}101102impl Masm for MacroAssembler {103type Address = Address;104type Ptr = u8;105type ABI = X64ABI;106107fn frame_setup(&mut self) -> Result<()> {108let frame_pointer = rbp();109let stack_pointer = rsp();110111self.asm.push_r(frame_pointer);112113if self.shared_flags.unwind_info() {114self.asm.unwind_inst(UnwindInst::PushFrameRegs {115offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),116})117}118119self.asm120.mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);121122Ok(())123}124125fn check_stack(&mut self, vmctx: Reg) -> Result<()> {126let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();127128self.with_scratch::<IntScratch, _>(|masm, scratch| {129masm.load_ptr(130masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,131scratch.writable(),132)?;133134masm.load_ptr(135Address::offset(136scratch.inner(),137ptr_size.vmstore_context_stack_limit().into(),138),139scratch.writable(),140)?;141142masm.add_stack_max(scratch.inner());143144masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);145masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);146anyhow::Ok(())147})?;148149// Emit unwind info.150if self.shared_flags.unwind_info() {151self.asm.unwind_inst(UnwindInst::DefineNewFrame {152offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),153154// The Winch calling convention has no callee-save registers, so nothing will be155// clobbered.156offset_downward_to_clobbers: 0,157})158}159Ok(())160}161162fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {163let bytes = match (reg.class(), size) {164(RegClass::Int, OperandSize::S64) => {165let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;166self.asm.push_r(reg);167self.increment_sp(word_bytes);168word_bytes169}170(RegClass::Int, OperandSize::S32) => {171let bytes = 
size.bytes();172self.reserve_stack(bytes)?;173let sp_offset = SPOffset::from_u32(self.sp_offset);174self.asm175.mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);176bytes177}178(RegClass::Float, _) => {179let bytes = size.bytes();180self.reserve_stack(bytes)?;181let sp_offset = SPOffset::from_u32(self.sp_offset);182self.asm183.xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);184bytes185}186_ => unreachable!(),187};188189Ok(StackSlot {190offset: SPOffset::from_u32(self.sp_offset),191size: bytes,192})193}194195fn reserve_stack(&mut self, bytes: u32) -> Result<()> {196if bytes == 0 {197return Ok(());198}199200self.asm201.sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);202self.increment_sp(bytes);203204Ok(())205}206207fn free_stack(&mut self, bytes: u32) -> Result<()> {208if bytes == 0 {209return Ok(());210}211self.asm212.add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);213self.decrement_sp(bytes);214215Ok(())216}217218fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {219self.sp_offset = offset.as_u32();220221Ok(())222}223224fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {225let (reg, offset) = if local.addressed_from_sp() {226let offset = self227.sp_offset228.checked_sub(local.offset)229.ok_or_else(|| CodeGenError::invalid_local_offset())?;230(rsp(), offset)231} else {232(rbp(), local.offset)233};234235Ok(Address::offset(reg, offset))236}237238fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {239Ok(Address::offset(240regs::rsp(),241self.sp_offset - offset.as_u32(),242))243}244245fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {246Ok(Address::offset(regs::rsp(), offset.as_u32()))247}248249fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {250Ok(Address::offset(vmctx!(Self), offset))251}252253fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {254self.store(src.into(), dst, self.ptr_size)255}256257fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {258self.store_impl(src, dst, size, TRUSTED_FLAGS)259}260261fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {262match kind {263StoreKind::Operand(size) => {264self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;265}266StoreKind::Atomic(size) => {267if size == OperandSize::S128 {268// TODO: we don't support 128-bit atomic store yet.269bail!(CodeGenError::unexpected_operand_size());270}271// To stay consistent with cranelift, we emit a normal store followed by a mfence,272// although, we could probably just emit a xchg.273self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;274self.asm.mfence();275}276StoreKind::VectorLane(LaneSelector { lane, size }) => {277self.ensure_has_avx()?;278self.asm279.xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);280}281}282283Ok(())284}285286fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {287let current_sp = SPOffset::from_u32(self.sp_offset);288let _ = match (dst.to_reg().class(), size) {289(RegClass::Int, OperandSize::S32) => {290let addr = self.address_from_sp(current_sp)?;291self.asm.movzx_mr(292&addr,293dst,294size.extend_to::<Zero>(OperandSize::S64),295TRUSTED_FLAGS,296);297self.free_stack(size.bytes())?;298}299(RegClass::Int, OperandSize::S64) => {300self.asm.pop_r(dst);301self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);302}303(RegClass::Float, _) | (RegClass::Vector, _) => {304let addr = 
self.address_from_sp(current_sp)?;305self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);306self.free_stack(size.bytes())?;307}308_ => bail!(CodeGenError::invalid_operand_combination()),309};310Ok(())311}312313fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {314let r = self315.scratch_scope316.reg_for_class(T::reg_class(), &mut |_| Ok(()))317.expect("Scratch register to be available");318319let ret = f(self, Scratch::new(r));320self.scratch_scope.free(r);321ret322}323324fn call(325&mut self,326stack_args_size: u32,327mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,328) -> Result<u32> {329let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();330let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();331let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);332let aligned_args_size = align_to(stack_args_size, alignment);333let total_stack = delta + aligned_args_size;334self.reserve_stack(total_stack)?;335let (callee, cc) = load_callee(self)?;336match callee {337CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),338CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),339};340Ok(total_stack)341}342343fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {344self.load(src, dst, self.ptr_size)345}346347fn compute_addr(348&mut self,349src: Self::Address,350dst: WritableReg,351size: OperandSize,352) -> Result<()> {353self.asm.lea(&src, dst, size);354Ok(())355}356357fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {358self.load_impl(src, dst, size, TRUSTED_FLAGS)359}360361fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {362let size = kind.derive_operand_size();363364match kind {365LoadKind::ScalarExtend(ext) => match ext {366ExtendKind::Signed(ext) => {367self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);368}369ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,370},371LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {372// The guarantees of the x86-64 memory model ensure that `SeqCst`373// loads are equivalent to normal loads.374if kind.is_atomic() && size == OperandSize::S128 {375bail!(CodeGenError::unexpected_operand_size());376}377378self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;379}380LoadKind::VectorExtend(ext) => {381self.ensure_has_avx()?;382self.asm383.xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)384}385LoadKind::Splat(_) => {386self.ensure_has_avx()?;387388if size == OperandSize::S64 {389self.asm390.xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);391self.asm.xmm_vpshuf_rr(392dst.to_reg(),393dst,394Self::vpshuf_mask_for_64_bit_splats(),395OperandSize::S32,396);397} else {398self.asm399.xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);400}401}402LoadKind::VectorLane(LaneSelector { lane, size }) => {403self.ensure_has_avx()?;404self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {405masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;406masm.asm407.xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);408anyhow::Ok(())409})?;410}411LoadKind::VectorZero(size) => {412self.ensure_has_avx()?;413self.with_scratch::<IntScratch, _>(|masm, scratch| {414masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;415masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);416anyhow::Ok(())417})?;418}419}420421Ok(())422}423424fn sp_offset(&self) -> Result<SPOffset> 
{425Ok(SPOffset::from_u32(self.sp_offset))426}427428fn zero(&mut self, reg: WritableReg) -> Result<()> {429self.asm.xor_rr(430reg.to_reg(),431reg,432OperandSize::from_bytes(<Self::ABI>::word_bytes()),433);434Ok(())435}436437fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {438match (src, dst.to_reg()) {439(RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {440(RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),441(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),442_ => bail!(CodeGenError::invalid_operand_combination()),443},444(RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),445}446}447448fn cmov(449&mut self,450dst: WritableReg,451src: Reg,452cc: IntCmpKind,453size: OperandSize,454) -> Result<()> {455match (src.class(), dst.to_reg().class()) {456(RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),457(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),458_ => Err(anyhow!(CodeGenError::invalid_operand_combination())),459}460}461462fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {463Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;464match (rhs, dst) {465(RegImm::Imm(imm), _) => {466if let Some(v) = imm.to_i32() {467self.asm.add_ir(v, dst, size);468} else {469self.with_scratch::<IntScratch, _>(|masm, scratch| {470masm.load_constant(&imm, scratch.writable(), size)?;471masm.asm.add_rr(scratch.inner(), dst, size);472anyhow::Ok(())473})?;474}475}476477(RegImm::Reg(src), dst) => {478self.asm.add_rr(src, dst, size);479}480}481482Ok(())483}484485fn checked_uadd(486&mut self,487dst: WritableReg,488lhs: Reg,489rhs: RegImm,490size: OperandSize,491trap: TrapCode,492) -> Result<()> {493self.add(dst, lhs, rhs, size)?;494self.asm.trapif(CC::B, trap);495Ok(())496}497498fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {499Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;500match (rhs, dst) {501(RegImm::Imm(imm), reg) => {502if let Some(v) = imm.to_i32() {503self.asm.sub_ir(v, reg, size);504} else {505self.with_scratch::<IntScratch, _>(|masm, scratch| {506masm.load_constant(&imm, scratch.writable(), size)?;507masm.asm.sub_rr(scratch.inner(), reg, size);508anyhow::Ok(())509})?;510}511}512513(RegImm::Reg(src), dst) => {514self.asm.sub_rr(src, dst, size);515}516}517518Ok(())519}520521fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {522Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;523match (rhs, dst) {524(RegImm::Imm(imm), _) => {525if let Some(v) = imm.to_i32() {526self.asm.mul_ir(v, dst, size);527} else {528self.with_scratch::<IntScratch, _>(|masm, scratch| {529masm.load_constant(&imm, scratch.writable(), size)?;530masm.asm.mul_rr(scratch.inner(), dst, size);531anyhow::Ok(())532})?;533}534}535536(RegImm::Reg(src), dst) => {537self.asm.mul_rr(src, dst, size);538}539}540541Ok(())542}543544fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {545Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;546self.asm.xmm_add_rr(rhs, dst, size);547Ok(())548}549550fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {551Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;552self.asm.xmm_sub_rr(rhs, dst, size);553Ok(())554}555556fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> 
{557Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;558self.asm.xmm_mul_rr(rhs, dst, size);559Ok(())560}561562fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {563Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;564self.asm.xmm_div_rr(rhs, dst, size);565Ok(())566}567568fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {569Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;570self.asm.xmm_min_seq(rhs, dst, size);571Ok(())572}573574fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {575Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;576self.asm.xmm_max_seq(rhs, dst, size);577Ok(())578}579580fn float_copysign(581&mut self,582dst: WritableReg,583lhs: Reg,584rhs: Reg,585size: OperandSize,586) -> Result<()> {587Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;588let sign_mask = match size {589OperandSize::S32 => I::I32(0x80000000),590OperandSize::S64 => I::I64(0x8000000000000000),591OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {592bail!(CodeGenError::unexpected_operand_size())593}594};595596self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {597masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {598masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;599masm.asm600.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);601602// Clear everything except sign bit in src.603masm.asm604.xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);605606// Clear sign bit in dst using scratch to store result. Then copy the607// result back to dst.608masm.asm609.xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);610masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);611612// Copy sign bit from src to dst.613masm.asm.xmm_or_rr(rhs, dst, size);614Ok(())615})616})617}618619fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {620debug_assert_eq!(dst.to_reg().class(), RegClass::Float);621let mask = match size {622OperandSize::S32 => I::I32(0x80000000),623OperandSize::S64 => I::I64(0x8000000000000000),624OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {625bail!(CodeGenError::unexpected_operand_size())626}627};628self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {629masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {630masm.load_constant(&mask, scratch_gpr.writable(), size)?;631masm.asm632.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);633masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);634Ok(())635})636})637}638639fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {640debug_assert_eq!(dst.to_reg().class(), RegClass::Float);641let mask = match size {642OperandSize::S32 => I::I32(0x7fffffff),643OperandSize::S64 => I::I64(0x7fffffffffffffff),644OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {645bail!(CodeGenError::unexpected_operand_size())646}647};648649self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {650masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {651masm.load_constant(&mask, scratch_gpr.writable(), size)?;652653masm.asm654.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);655masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);656Ok(())657})658})659}660661fn float_round<662F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,663>(664&mut self,665mode: RoundingMode,666env: &mut FuncEnv<Self::Ptr>,667context: &mut CodeGenContext<Emission>,668size: 
OperandSize,669mut fallback: F,670) -> Result<()> {671if self.flags.has_sse41() {672let src = context.pop_to_reg(self, None)?;673self.asm674.xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);675context.stack.push(src.into());676Ok(())677} else {678fallback(env, context, self)679}680}681682fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {683self.asm.sqrt(src, dst, size);684Ok(())685}686687fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {688Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;689match (rhs, dst) {690(RegImm::Imm(imm), _) => {691if let Some(v) = imm.to_i32() {692self.asm.and_ir(v, dst, size);693} else {694self.with_scratch::<IntScratch, _>(|masm, scratch| {695masm.load_constant(&imm, scratch.writable(), size)?;696masm.asm.and_rr(scratch.inner(), dst, size);697anyhow::Ok(())698})?;699}700}701702(RegImm::Reg(src), dst) => {703self.asm.and_rr(src, dst, size);704}705}706707Ok(())708}709710fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {711Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;712match (rhs, dst) {713(RegImm::Imm(imm), _) => {714if let Some(v) = imm.to_i32() {715self.asm.or_ir(v, dst, size);716} else {717self.with_scratch::<IntScratch, _>(|masm, scratch| {718masm.load_constant(&imm, scratch.writable(), size)?;719masm.asm.or_rr(scratch.inner(), dst, size);720anyhow::Ok(())721})?;722}723}724725(RegImm::Reg(src), dst) => {726self.asm.or_rr(src, dst, size);727}728}729730Ok(())731}732733fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {734Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;735match (rhs, dst) {736(RegImm::Imm(imm), _) => {737if let Some(v) = imm.to_i32() {738self.asm.xor_ir(v, dst, size);739} else {740self.with_scratch::<IntScratch, _>(|masm, scratch| {741masm.load_constant(&imm, scratch.writable(), size)?;742masm.asm.xor_rr(scratch.inner(), dst, size);743anyhow::Ok(())744})?;745}746}747748(RegImm::Reg(src), _) => {749self.asm.xor_rr(src, dst, size);750}751}752753Ok(())754}755756fn shift_ir(757&mut self,758dst: WritableReg,759imm: I,760lhs: Reg,761kind: ShiftKind,762size: OperandSize,763) -> Result<()> {764Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;765self.asm766.shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);767Ok(())768}769770fn shift(771&mut self,772context: &mut CodeGenContext<Emission>,773kind: ShiftKind,774size: OperandSize,775) -> Result<()> {776// Number of bits to shift must be in the CL register.777let src = context.pop_to_reg(self, Some(regs::rcx()))?;778let dst = context.pop_to_reg(self, None)?;779780self.asm781.shift_rr(src.into(), writable!(dst.into()), kind, size);782783context.free_reg(src);784context.stack.push(dst.into());785786Ok(())787}788789fn div(790&mut self,791context: &mut CodeGenContext<Emission>,792kind: DivKind,793size: OperandSize,794) -> Result<()> {795// Allocate rdx:rax.796let rdx = context.reg(regs::rdx(), self)?;797let rax = context.reg(regs::rax(), self)?;798799// Allocate the divisor, which can be any gpr.800let divisor = context.pop_to_reg(self, None)?;801802// Mark rax as allocatable.803context.free_reg(rax);804// Move the top value to rax.805let rax = context.pop_to_reg(self, Some(rax))?;806self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);807808// Free the divisor and rdx.809context.free_reg(divisor);810context.free_reg(rdx);811812// Push the quotient.813context.stack.push(rax.into());814Ok(())815}816817fn 
rem(818&mut self,819context: &mut CodeGenContext<Emission>,820kind: RemKind,821size: OperandSize,822) -> Result<()> {823// Allocate rdx:rax.824let rdx = context.reg(regs::rdx(), self)?;825let rax = context.reg(regs::rax(), self)?;826827// Allocate the divisor, which can be any gpr.828let divisor = context.pop_to_reg(self, None)?;829830// Mark rax as allocatable.831context.free_reg(rax);832// Move the top value to rax.833let rax = context.pop_to_reg(self, Some(rax))?;834self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);835836// Free the divisor and rax.837context.free_reg(divisor);838context.free_reg(rax);839840// Push the remainder.841context.stack.push(Val::reg(rdx, divisor.ty));842843Ok(())844}845846fn frame_restore(&mut self) -> Result<()> {847debug_assert_eq!(self.sp_offset, 0);848self.asm.pop_r(writable!(rbp()));849self.asm.ret();850Ok(())851}852853fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {854if let Some(patch) = self.stack_max_use_add {855patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());856}857858Ok(self.asm.finalize(base))859}860861fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {862Ok(Address::offset(reg, offset))863}864865fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {866match src2 {867RegImm::Imm(imm) => {868if let Some(v) = imm.to_i32() {869self.asm.cmp_ir(src1, v, size);870} else {871self.with_scratch::<IntScratch, _>(|masm, scratch| {872masm.load_constant(&imm, scratch.writable(), size)?;873masm.asm.cmp_rr(src1, scratch.inner(), size);874anyhow::Ok(())875})?;876}877}878RegImm::Reg(src2) => {879self.asm.cmp_rr(src1, src2, size);880}881}882883Ok(())884}885886fn cmp_with_set(887&mut self,888dst: WritableReg,889src: RegImm,890kind: IntCmpKind,891size: OperandSize,892) -> Result<()> {893self.cmp(dst.to_reg(), src, size)?;894self.asm.setcc(kind, dst);895Ok(())896}897898fn float_cmp_with_set(899&mut self,900dst: WritableReg,901src1: Reg,902src2: Reg,903kind: FloatCmpKind,904size: OperandSize,905) -> Result<()> {906// Float comparisons needs to be ordered (that is, comparing with a NaN907// should return 0) except for not equal which needs to be unordered.908// We use ucomis{s, d} because comis{s, d} has an undefined result if909// either operand is NaN. Since ucomis{s, d} is unordered, we need to910// compensate to make the comparison ordered. 
Ucomis{s, d} sets the911// ZF, PF, and CF flags if there is an unordered result.912let (src1, src2, set_kind) = match kind {913FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),914FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),915FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),916FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),917// Reversing the operands and using the complementary comparison918// avoids needing to perform an additional SETNP and AND919// instruction.920// SETNB and SETNBE check if the carry flag is unset (i.e., not921// less than and not unordered) so we get the intended result922// without having to look at the parity flag.923FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),924FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),925};926self.asm.ucomis(src1, src2, size);927self.asm.setcc(set_kind, dst);928let _ = match kind {929FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {930// Return false if either operand is NaN by ensuring PF is931// unset.932self.with_scratch::<IntScratch, _>(|masm, scratch| {933masm.asm.setnp(scratch.writable());934masm.asm.and_rr(scratch.inner(), dst, size);935});936}937FloatCmpKind::Ne => {938// Return true if either operand is NaN by checking if PF is939// set.940self.with_scratch::<IntScratch, _>(|masm, scratch| {941masm.asm.setp(scratch.writable());942masm.asm.or_rr(scratch.inner(), dst, size);943});944}945FloatCmpKind::Lt | FloatCmpKind::Le => (),946};947Ok(())948}949950fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {951if self.flags.has_lzcnt() {952self.asm.lzcnt(src, dst, size);953} else {954self.with_scratch::<IntScratch, _>(|masm, scratch| {955// Use the following approach:956// dst = size.num_bits() - bsr(src) - is_not_zero957// = size.num.bits() + -bsr(src) - is_not_zero.958masm.asm.bsr(src, dst, size);959masm.asm.setcc(IntCmpKind::Ne, scratch.writable());960masm.asm.neg(dst.to_reg(), dst, size);961masm.asm.add_ir(size.num_bits() as i32, dst, size);962masm.asm.sub_rr(scratch.inner(), dst, size);963});964}965966Ok(())967}968969fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {970if self.flags.has_bmi1() {971self.asm.tzcnt(src, dst, size);972} else {973self.with_scratch::<IntScratch, _>(|masm, scratch| {974// Use the following approach:975// dst = bsf(src) + (is_zero * size.num_bits())976// = bsf(src) + (is_zero << size.log2()).977// BSF outputs the correct value for every value except 0.978// When the value is 0, BSF outputs 0, correct output for ctz is979// the number of bits.980masm.asm.bsf(src, dst, size);981masm.asm.setcc(IntCmpKind::Eq, scratch.writable());982masm.asm983.shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);984masm.asm.add_rr(scratch.inner(), dst, size);985});986}987988Ok(())989}990991fn get_label(&mut self) -> Result<MachLabel> {992let buffer = self.asm.buffer_mut();993Ok(buffer.get_label())994}995996fn bind(&mut self, label: MachLabel) -> Result<()> {997let buffer = self.asm.buffer_mut();998buffer.bind_label(label, &mut Default::default());999Ok(())1000}10011002fn branch(1003&mut self,1004kind: IntCmpKind,1005lhs: Reg,1006rhs: RegImm,1007taken: MachLabel,1008size: OperandSize,1009) -> Result<()> {1010use IntCmpKind::*;10111012match &(lhs, rhs) {1013(rlhs, RegImm::Reg(rrhs)) => {1014// If the comparison kind is zero or not zero and both operands1015// are the same register, emit a test instruction. 
Else we emit1016// a normal comparison.1017if (kind == Eq || kind == Ne) && (rlhs == rrhs) {1018self.asm.test_rr(*rlhs, *rrhs, size);1019} else {1020self.cmp(lhs, rhs, size)?;1021}1022}1023_ => self.cmp(lhs, rhs, size)?,1024}1025self.asm.jmp_if(kind, taken);1026Ok(())1027}10281029fn jmp(&mut self, target: MachLabel) -> Result<()> {1030self.asm.jmp(target);1031Ok(())1032}10331034fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {1035let src = context.pop_to_reg(self, None)?;1036if self.flags.has_popcnt() && self.flags.has_sse42() {1037self.asm.popcnt(src.into(), writable!(src.into()), size);1038context.stack.push(src.into());1039Ok(())1040} else {1041// The fallback functionality here is based on `MacroAssembler::popcnt64` in:1042// https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#49510431044let tmp = writable!(context.any_gpr(self)?);1045let dst = writable!(src.into());1046let (masks, shift_amt) = match size {1047OperandSize::S64 => (1048[10490x5555555555555555, // m110500x3333333333333333, // m210510x0f0f0f0f0f0f0f0f, // m410520x0101010101010101, // h011053],105456u8,1055),1056// 32-bit popcount is the same, except the masks are half as1057// wide and we shift by 24 at the end rather than 561058OperandSize::S32 => (1059[0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],106024u8,1061),1062_ => bail!(CodeGenError::unexpected_operand_size()),1063};1064self.asm.mov_rr(src.into(), tmp, size);10651066// x -= (x >> 1) & m1;1067self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);1068let lhs = dst.to_reg();1069self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;1070self.asm.sub_rr(dst.to_reg(), tmp, size);10711072// x = (x & m2) + ((x >> 2) & m2);1073self.asm.mov_rr(tmp.to_reg(), dst, size);1074// Load `0x3333...` into the scratch reg once, allowing us to use1075// `and_rr` and avoid inadvertently loading it twice as with `and`10761077self.with_scratch::<IntScratch, _>(|masm, scratch| {1078masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;1079masm.asm.and_rr(scratch.inner(), dst, size);1080masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);1081masm.asm.and_rr(scratch.inner(), tmp, size);1082anyhow::Ok(())1083})?;1084self.asm.add_rr(dst.to_reg(), tmp, size);10851086// x = (x + (x >> 4)) & m4;1087self.asm.mov_rr(tmp.to_reg(), dst, size);1088self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);1089self.asm.add_rr(tmp.to_reg(), dst, size);1090let lhs = dst.to_reg();1091self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;10921093// (x * h01) >> shift_amt1094let lhs = dst.to_reg();1095self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;1096self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);10971098context.stack.push(src.into());1099context.free_reg(tmp.to_reg());11001101Ok(())1102}1103}11041105fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1106self.asm.mov_rr(src, dst, OperandSize::S32);1107Ok(())1108}11091110fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {1111match kind {1112ExtendKind::Signed(ext) => {1113self.asm.movsx_rr(src, dst, ext);1114}1115ExtendKind::Unsigned(ext) => {1116self.asm.movzx_rr(src, dst, ext);1117}1118}11191120Ok(())1121}11221123fn signed_truncate(1124&mut self,1125dst: WritableReg,1126src: Reg,1127src_size: OperandSize,1128dst_size: OperandSize,1129kind: TruncKind,1130) -> Result<()> {1131self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {1132masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| 
{1133masm.asm.cvt_float_to_sint_seq(1134src,1135dst,1136gpr_scratch.inner(),1137xmm_scratch.inner(),1138src_size,1139dst_size,1140kind.is_checked(),1141);1142Ok(())1143})1144})1145}11461147fn unsigned_truncate(1148&mut self,1149ctx: &mut CodeGenContext<Emission>,1150src_size: OperandSize,1151dst_size: OperandSize,1152kind: TruncKind,1153) -> Result<()> {1154let dst_ty = match dst_size {1155OperandSize::S32 => WasmValType::I32,1156OperandSize::S64 => WasmValType::I64,1157_ => bail!(CodeGenError::unexpected_operand_size()),1158};11591160ctx.convert_op_with_tmp_reg(1161self,1162dst_ty,1163RegClass::Float,1164|masm, dst, src, tmp_fpr, dst_size| {1165masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {1166masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {1167masm.asm.cvt_float_to_uint_seq(1168src,1169writable!(dst),1170gpr_scratch.inner(),1171xmm_scratch.inner(),1172tmp_fpr,1173src_size,1174dst_size,1175kind.is_checked(),1176);1177Ok(())1178})1179})1180},1181)1182}11831184fn signed_convert(1185&mut self,1186dst: WritableReg,1187src: Reg,1188src_size: OperandSize,1189dst_size: OperandSize,1190) -> Result<()> {1191self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);1192Ok(())1193}11941195fn unsigned_convert(1196&mut self,1197dst: WritableReg,1198src: Reg,1199tmp_gpr: Reg,1200src_size: OperandSize,1201dst_size: OperandSize,1202) -> Result<()> {1203// Need to convert unsigned uint32 to uint64 for conversion instruction sequence.1204if let OperandSize::S32 = src_size {1205self.extend(1206writable!(src),1207src,1208ExtendKind::Unsigned(Extend::I64Extend32),1209)?;1210}12111212self.with_scratch::<IntScratch, _>(|masm, scratch| {1213masm.asm1214.cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);1215Ok(())1216})1217}12181219fn reinterpret_float_as_int(1220&mut self,1221dst: WritableReg,1222src: Reg,1223size: OperandSize,1224) -> Result<()> {1225self.asm.xmm_to_gpr(src, dst, size);1226Ok(())1227}12281229fn reinterpret_int_as_float(1230&mut self,1231dst: WritableReg,1232src: Reg,1233size: OperandSize,1234) -> Result<()> {1235self.asm.gpr_to_xmm(src, dst, size);1236Ok(())1237}12381239fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1240self.asm1241.cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);1242Ok(())1243}12441245fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {1246self.asm1247.cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);1248Ok(())1249}12501251fn unreachable(&mut self) -> Result<()> {1252self.asm.trap(TRAP_UNREACHABLE);1253Ok(())1254}12551256fn trap(&mut self, code: TrapCode) -> Result<()> {1257self.asm.trap(code);1258Ok(())1259}12601261fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {1262self.asm.trapif(cc, code);1263Ok(())1264}12651266fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {1267self.asm.test_rr(src, src, self.ptr_size);1268self.asm.trapif(IntCmpKind::Eq, code);1269Ok(())1270}12711272fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {1273// At least one default target.1274debug_assert!(targets.len() >= 1);1275let default_index = targets.len() - 1;1276// Emit bounds check, by conditionally moving the max cases1277// into the given index reg if the contents of the index reg1278// are greater.1279let max = default_index;1280let size = OperandSize::S32;1281self.asm.mov_ir(max as u64, writable!(tmp), size);1282self.asm.cmp_rr(tmp, index, size);1283self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);12841285let default = 
targets[default_index];1286let rest = &targets[0..default_index];12871288self.with_scratch::<IntScratch, _>(|masm, tmp1| {1289masm.asm1290.jmp_table(rest.into(), default, index, tmp1.inner(), tmp);1291Ok(())1292})1293}12941295fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {1296Ok(self.asm.buffer_mut().start_srcloc(loc))1297}12981299fn end_source_loc(&mut self) -> Result<()> {1300self.asm.buffer_mut().end_srcloc();1301Ok(())1302}13031304fn current_code_offset(&self) -> Result<CodeOffset> {1305Ok(self.asm.buffer().cur_offset())1306}13071308fn add128(1309&mut self,1310dst_lo: WritableReg,1311dst_hi: WritableReg,1312lhs_lo: Reg,1313lhs_hi: Reg,1314rhs_lo: Reg,1315rhs_hi: Reg,1316) -> Result<()> {1317Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;1318Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;1319self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);1320self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);1321Ok(())1322}13231324fn sub128(1325&mut self,1326dst_lo: WritableReg,1327dst_hi: WritableReg,1328lhs_lo: Reg,1329lhs_hi: Reg,1330rhs_lo: Reg,1331rhs_hi: Reg,1332) -> Result<()> {1333Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;1334Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;1335self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);1336self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);1337Ok(())1338}13391340fn mul_wide(1341&mut self,1342context: &mut CodeGenContext<Emission>,1343kind: MulWideKind,1344) -> Result<()> {1345// Reserve rax/rdx since they're required by the `mul_wide` instruction1346// being used here.1347let rax = context.reg(regs::rax(), self)?;1348let rdx = context.reg(regs::rdx(), self)?;13491350// The rhs of this binop can be in any register1351let rhs = context.pop_to_reg(self, None)?;1352// Mark rax as allocatable. 
and then force the lhs operand to be placed1353// in `rax`.1354context.free_reg(rax);1355let lhs = context.pop_to_reg(self, Some(rax))?;13561357self.asm.mul_wide(1358writable!(rax),1359writable!(rdx),1360lhs.reg,1361rhs.reg,1362kind,1363OperandSize::S64,1364);13651366// No longer using the rhs register after the multiplication has been1367// executed.1368context.free_reg(rhs);13691370// The low bits of the result are in rax, where `lhs` was allocated to1371context.stack.push(lhs.into());1372// The high bits of the result are in rdx, which we previously reserved.1373context.stack.push(Val::Reg(TypedReg::i64(rdx)));13741375Ok(())1376}13771378fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {1379// Get the source and destination operands set up first.1380let (src, dst) = match size {1381// Floats can use the same register for `src` and `dst`.1382SplatKind::F32x4 | SplatKind::F64x2 => {1383let reg = context.pop_to_reg(self, None)?.reg;1384(RegImm::reg(reg), writable!(reg))1385}1386// For ints, we need to load the operand into a vector register if1387// it's not a constant.1388SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {1389let dst = writable!(context.any_fpr(self)?);1390let src = if size == SplatKind::I64x2 {1391context.pop_i64_const().map(RegImm::i64)1392} else {1393context.pop_i32_const().map(RegImm::i32)1394}1395.map_or_else(1396|| -> Result<RegImm> {1397let reg = context.pop_to_reg(self, None)?.reg;1398self.reinterpret_int_as_float(1399dst,1400reg,1401match size {1402SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {1403OperandSize::S321404}1405SplatKind::I64x2 => OperandSize::S64,1406SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),1407},1408)?;1409context.free_reg(reg);1410Ok(RegImm::Reg(dst.to_reg()))1411},1412Ok,1413)?;1414(src, dst)1415}1416};14171418// Perform the splat on the operands.1419if size == SplatKind::I64x2 || size == SplatKind::F64x2 {1420self.ensure_has_avx()?;1421let mask = Self::vpshuf_mask_for_64_bit_splats();1422match src {1423RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),1424RegImm::Imm(imm) => {1425let src = self.asm.add_constant(&imm.to_bytes());1426self.asm1427.xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());1428}1429}1430} else {1431self.ensure_has_avx2()?;14321433match src {1434RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),1435RegImm::Imm(imm) => {1436let src = self.asm.add_constant(&imm.to_bytes());1437self.asm1438.xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());1439}1440}1441}14421443context1444.stack1445.push(Val::reg(dst.to_reg(), WasmValType::V128));1446Ok(())1447}14481449fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {1450self.ensure_has_avx()?;14511452// Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`1453// separately to either the selected index or 0.1454// Then use `vpor` to combine `lhs` and `rhs` into `dst`.1455// Setting the most significant bit in the mask's lane to 1 will1456// result in corresponding lane in the destination register being1457// set to 0. 
0x80 sets the most significant bit to 1.1458let mut mask_lhs: [u8; 16] = [0x80; 16];1459let mut mask_rhs: [u8; 16] = [0x80; 16];1460for i in 0..lanes.len() {1461if lanes[i] < 16 {1462mask_lhs[i] = lanes[i];1463} else {1464mask_rhs[i] = lanes[i] - 16;1465}1466}1467let mask_lhs = self.asm.add_constant(&mask_lhs);1468let mask_rhs = self.asm.add_constant(&mask_rhs);14691470self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);1471self.with_scratch::<FloatScratch, _>(|masm, scratch| {1472masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);1473masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());1474Ok(())1475})1476}14771478fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {1479self.ensure_has_avx()?;14801481// Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything1482// outside that range.1483// Each lane is a signed byte so the maximum value is 0x7F. Adding1484// 0x70 to any value higher than 0xF will saturate resulting in a value1485// of 0xFF (i.e., 0).1486let clamp = self.asm.add_constant(&[0x70; 16]);1487self.asm1488.xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);14891490// Don't need to subtract 0x70 since `vpshufb` uses the least1491// significant 4 bits which are the same after adding 0x70.1492self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);1493Ok(())1494}14951496fn atomic_rmw(1497&mut self,1498context: &mut CodeGenContext<Emission>,1499addr: Self::Address,1500size: OperandSize,1501op: RmwOp,1502flags: MemFlags,1503extend: Option<Extend<Zero>>,1504) -> Result<()> {1505let res = match op {1506RmwOp::Add => {1507let operand = context.pop_to_reg(self, None)?;1508self.asm1509.lock_xadd(addr, writable!(operand.reg), size, flags);1510operand.reg1511}1512RmwOp::Sub => {1513let operand = context.pop_to_reg(self, None)?;1514self.asm.neg(operand.reg, writable!(operand.reg), size);1515self.asm1516.lock_xadd(addr, writable!(operand.reg), size, flags);1517operand.reg1518}1519RmwOp::Xchg => {1520let operand = context.pop_to_reg(self, None)?;1521self.asm.xchg(addr, writable!(operand.reg), size, flags);1522operand.reg1523}1524RmwOp::And | RmwOp::Or | RmwOp::Xor => {1525let op = match op {1526RmwOp::And => AtomicRmwSeqOp::And,1527RmwOp::Or => AtomicRmwSeqOp::Or,1528RmwOp::Xor => AtomicRmwSeqOp::Xor,1529_ => unreachable!(1530"invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"1531),1532};1533let dst = context.reg(regs::rax(), self)?;1534let operand = context.pop_to_reg(self, None)?;15351536self.with_scratch::<IntScratch, _>(|masm, scratch| {1537masm.asm.atomic_rmw_seq(1538addr,1539operand.reg,1540writable!(dst),1541scratch.writable(),1542size,1543flags,1544op,1545);1546});15471548context.free_reg(operand.reg);1549dst1550}1551};15521553let dst_ty = match extend {1554Some(ext) => {1555// We don't need to zero-extend from 32 to 64bits.1556if !(ext.from_bits() == 32 && ext.to_bits() == 64) {1557self.asm.movzx_rr(res, writable!(res), ext);1558}15591560WasmValType::int_from_bits(ext.to_bits())1561}1562None => WasmValType::int_from_bits(size.num_bits()),1563};15641565context.stack.push(TypedReg::new(dst_ty, res).into());15661567Ok(())1568}15691570fn extract_lane(1571&mut self,1572src: Reg,1573dst: WritableReg,1574lane: u8,1575kind: ExtractLaneKind,1576) -> Result<()> {1577self.ensure_has_avx()?;15781579match kind {1580ExtractLaneKind::I8x16S1581| ExtractLaneKind::I8x16U1582| ExtractLaneKind::I16x8S1583| ExtractLaneKind::I16x8U1584| ExtractLaneKind::I32x41585| ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, 
kind.lane_size()),1586ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {1587// If the `src` and `dst` registers are the same, then the1588// appropriate value is already in the correct position in1589// the register.1590assert!(src == dst.to_reg());1591}1592ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),1593ExtractLaneKind::F64x2 => {1594// `0b11_10` selects the high and low 32-bits of the second1595// 64-bit, so `0b11_10_11_10` splats the 64-bit value across1596// both lanes. Since we put an `f64` on the stack, we use1597// the splatted value.1598// Double-check `lane == 0` was handled in another branch.1599assert!(lane == 1);1600self.asm1601.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)1602}1603}16041605// Sign-extend to 32-bits for sign extended kinds.1606match kind {1607ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {1608self.asm.movsx_rr(dst.to_reg(), dst, kind.into())1609}1610_ => (),1611}16121613Ok(())1614}16151616fn replace_lane(1617&mut self,1618src: RegImm,1619dst: WritableReg,1620lane: u8,1621kind: ReplaceLaneKind,1622) -> Result<()> {1623self.ensure_has_avx()?;16241625match kind {1626ReplaceLaneKind::I8x161627| ReplaceLaneKind::I16x81628| ReplaceLaneKind::I32x41629| ReplaceLaneKind::I64x2 => match src {1630RegImm::Reg(reg) => {1631self.asm1632.xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());1633}1634RegImm::Imm(imm) => {1635let address = self.asm.add_constant(&imm.to_bytes());1636self.asm1637.xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());1638}1639},1640ReplaceLaneKind::F32x4 => {1641// Immediate for `vinsertps` uses first 3 bits to determine1642// which elements of the destination to set to 0. The next 21643// bits specify which element of the destination will be1644// overwritten.1645let imm = lane << 4;1646match src {1647RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),1648RegImm::Imm(val) => {1649let address = self.asm.add_constant(&val.to_bytes());1650self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);1651}1652}1653}1654ReplaceLaneKind::F64x2 => match src {1655RegImm::Reg(reg) => match lane {16560 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),16571 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),1658_ => unreachable!(),1659},1660RegImm::Imm(imm) => {1661let address = self.asm.add_constant(&imm.to_bytes());1662match lane {16630 => {1664// Memory load variant of `vmovsd` zeroes the upper1665// 64 bits of the register so need to load the1666// immediate to a register to use the register1667// variant of `vmovsd` to perform the merge.16681669self.with_scratch::<FloatScratch, _>(|masm, scratch| {1670masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);1671masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());1672});1673}16741 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),1675_ => unreachable!(),1676}1677}1678},1679}1680Ok(())1681}16821683fn atomic_cas(1684&mut self,1685context: &mut CodeGenContext<Emission>,1686addr: Self::Address,1687size: OperandSize,1688flags: MemFlags,1689extend: Option<Extend<Zero>>,1690) -> Result<()> {1691// `cmpxchg` expects `expected` to be in the `*a*` register.1692// reserve rax for the expected argument.1693let rax = context.reg(regs::rax(), self)?;16941695let replacement = context.pop_to_reg(self, None)?;16961697// mark `rax` as allocatable again.1698context.free_reg(rax);1699let expected = context.pop_to_reg(self, Some(regs::rax()))?;17001701self.asm1702.cmpxchg(addr, 
replacement.reg, writable!(expected.reg), size, flags);17031704if let Some(extend) = extend {1705// We don't need to zero-extend from 32 to 64bits.1706if !(extend.from_bits() == 32 && extend.to_bits() == 64) {1707self.asm1708.movzx_rr(expected.reg, writable!(expected.reg), extend);1709}1710}17111712context.stack.push(expected.into());1713context.free_reg(replacement);17141715Ok(())1716}17171718fn v128_eq(1719&mut self,1720dst: WritableReg,1721lhs: Reg,1722rhs: Reg,1723kind: VectorEqualityKind,1724) -> Result<()> {1725self.ensure_has_avx()?;17261727match kind {1728VectorEqualityKind::I8x161729| VectorEqualityKind::I16x81730| VectorEqualityKind::I32x41731| VectorEqualityKind::I64x2 => {1732self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())1733}1734VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {1735self.asm1736.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)1737}1738}1739Ok(())1740}17411742fn v128_ne(1743&mut self,1744dst: WritableReg,1745lhs: Reg,1746rhs: Reg,1747kind: VectorEqualityKind,1748) -> Result<()> {1749self.ensure_has_avx()?;17501751match kind {1752VectorEqualityKind::I8x161753| VectorEqualityKind::I16x81754| VectorEqualityKind::I32x41755| VectorEqualityKind::I64x2 => {1756// Check for equality and invert the results.1757self.asm1758.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1759self.asm1760.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1761self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1762}1763VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {1764self.asm1765.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)1766}1767}1768Ok(())1769}17701771fn v128_lt(1772&mut self,1773dst: WritableReg,1774lhs: Reg,1775rhs: Reg,1776kind: VectorCompareKind,1777) -> Result<()> {1778self.ensure_has_avx()?;17791780match kind {1781VectorCompareKind::I8x16S1782| VectorCompareKind::I16x8S1783| VectorCompareKind::I32x4S1784| VectorCompareKind::I64x2S => {1785// Perform a greater than check with reversed parameters.1786self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())1787}1788VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1789// Set `lhs` to min values, check for equality, then invert the1790// result.1791// If `lhs` is smaller, then equality check will fail and result1792// will be inverted to true. 
Otherwise the equality check will1793// pass and be inverted to false.1794self.asm1795.xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1796self.asm1797.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1798self.asm1799.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1800self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1801}1802VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1803self.asm1804.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)1805}1806}1807Ok(())1808}18091810fn v128_le(1811&mut self,1812dst: WritableReg,1813lhs: Reg,1814rhs: Reg,1815kind: VectorCompareKind,1816) -> Result<()> {1817self.ensure_has_avx()?;18181819match kind {1820VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {1821// Set the `rhs` vector to the signed minimum values and then1822// compare them with `lhs` for equality.1823self.asm1824.xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1825self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1826}1827VectorCompareKind::I64x2S => {1828// Do a greater than check and invert the results.1829self.asm1830.xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1831self.asm1832.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1833self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1834}1835VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1836// Set the `rhs` vector to the signed minimum values and then1837// compare them with `lhs` for equality.1838self.asm1839.xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1840self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1841}1842VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1843self.asm1844.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)1845}1846}1847Ok(())1848}18491850fn v128_gt(1851&mut self,1852dst: WritableReg,1853lhs: Reg,1854rhs: Reg,1855kind: VectorCompareKind,1856) -> Result<()> {1857self.ensure_has_avx()?;18581859match kind {1860VectorCompareKind::I8x16S1861| VectorCompareKind::I16x8S1862| VectorCompareKind::I32x4S1863| VectorCompareKind::I64x2S => {1864self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())1865}1866VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1867// Set `lhs` to max values, check for equality, then invert the1868// result.1869// If `lhs` is larger, then equality check will fail and result1870// will be inverted to true. 
Otherwise the equality check will1871// pass and be inverted to false.1872self.asm1873.xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1874self.asm1875.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());1876self.asm1877.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());1878self.asm.xmm_vpxor_rrr(lhs, rhs, dst);1879}1880VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1881// Do a less than comparison with the operands swapped.1882self.asm1883.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)1884}1885}1886Ok(())1887}18881889fn v128_ge(1890&mut self,1891dst: WritableReg,1892lhs: Reg,1893rhs: Reg,1894kind: VectorCompareKind,1895) -> Result<()> {1896self.ensure_has_avx()?;18971898match kind {1899VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {1900// Set each lane to maximum value and then compare for equality.1901self.asm1902.xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1903self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1904}1905VectorCompareKind::I64x2S => {1906// Perform a greater than comparison with operands swapped,1907// then invert the results.1908self.asm1909.xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());1910self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());1911self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);1912}1913VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {1914// Set lanes to maximum values and compare them for equality.1915self.asm1916.xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());1917self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());1918}1919VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {1920// Perform a less than or equal comparison on swapped operands.1921self.asm1922.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)1923}1924}19251926Ok(())1927}19281929fn fence(&mut self) -> Result<()> {1930self.asm.mfence();1931Ok(())1932}19331934fn v128_not(&mut self, dst: WritableReg) -> Result<()> {1935self.ensure_has_avx()?;19361937self.with_scratch::<FloatScratch, _>(|masm, tmp| {1938// First, we initialize `tmp` with all ones by comparing it with1939// itself.1940masm.asm1941.xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);1942// Then we `xor` tmp and `dst` together, yielding `!dst`.1943masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);1944Ok(())1945})1946}19471948fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1949self.ensure_has_avx()?;1950self.asm.xmm_vpand_rrr(src1, src2, dst);1951Ok(())1952}19531954fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1955self.ensure_has_avx()?;1956self.asm.xmm_vpandn_rrr(src1, src2, dst);1957Ok(())1958}19591960fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1961self.ensure_has_avx()?;1962self.asm.xmm_vpor_rrr(dst, src1, src2);1963Ok(())1964}19651966fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {1967self.ensure_has_avx()?;1968self.asm.xmm_vpxor_rrr(src1, src2, dst);1969Ok(())1970}19711972fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {1973self.ensure_has_avx()?;19741975self.with_scratch::<FloatScratch, _>(|masm, tmp| {1976masm.v128_and(src1, mask, tmp.writable())?;1977masm.v128_and_not(mask, src2, dst)?;1978masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;1979Ok(())1980})1981}19821983fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> 
Result<()> {1984self.ensure_has_avx()?;1985self.asm.xmm_vptest(src, src);1986self.asm.setcc(IntCmpKind::Ne, dst);1987Ok(())1988}19891990fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {1991self.ensure_has_avx()?;1992match kind {1993V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),1994V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),1995V128ConvertKind::I32x4U => {1996self.with_scratch::<FloatScratch, _>(|masm, scratch| {1997// Split each 32-bit integer into 16-bit parts.1998// `scratch` will contain the low bits and `dst` will contain1999// the high bits.2000masm.asm2001.xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());2002masm.asm.xmm_vpsrl_rri(2003scratch.inner(),2004scratch.writable(),20050x10,2006kind.src_lane_size(),2007);2008masm.asm2009.xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());20102011// Convert the low bits in `scratch` to floating point numbers.2012masm.asm2013.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);20142015// Prevent overflow by right shifting high bits.2016masm.asm2017.xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());2018// Convert high bits in `dst` to floating point numbers.2019masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);2020// Double high bits in `dst` to reverse right shift.2021masm.asm2022.xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());2023// Add high bits in `dst` to low bits in `scratch`.2024masm.asm.xmm_vaddp_rrr(2025dst.to_reg(),2026scratch.inner(),2027dst,2028kind.src_lane_size(),2029);2030});2031}2032V128ConvertKind::I32x4LowU => {2033// See2034// https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L36682035// for details on the Cranelift AVX implementation.2036// Use `vunpcklp` to create doubles from the integers.2037// Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers2038// creates a byte array for a double that sets the mantissa2039// bits to the original integer value.2040let conversion_constant = self2041.asm2042.add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);2043self.asm2044.xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());2045// Subtract the 0x1.0p52 added above.2046let conversion_constant = self.asm.add_constant(&[20470x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,20480x00, 0x30, 0x43,2049]);2050self.asm.xmm_vsub_rrm(2051dst.to_reg(),2052&conversion_constant,2053dst,2054kind.dst_lane_size(),2055);2056}2057}2058Ok(())2059}20602061fn v128_narrow(2062&mut self,2063src1: Reg,2064src2: Reg,2065dst: WritableReg,2066kind: V128NarrowKind,2067) -> Result<()> {2068self.ensure_has_avx()?;2069match kind {2070V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {2071self.asm2072.xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())2073}2074V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {2075self.asm2076.xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())2077}2078}2079Ok(())2080}20812082fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {2083self.ensure_has_avx()?;2084self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);2085Ok(())2086}20872088fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {2089self.ensure_has_avx()?;2090self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);2091Ok(())2092}20932094fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: 
V128ExtendKind) -> Result<()> {2095self.ensure_has_avx()?;2096match kind {2097V128ExtendKind::LowI8x16S2098| V128ExtendKind::LowI8x16U2099| V128ExtendKind::LowI16x8S2100| V128ExtendKind::LowI16x8U2101| V128ExtendKind::LowI32x4S2102| V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),2103V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {2104self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);2105self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());2106}2107V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {2108self.with_scratch::<FloatScratch, _>(|masm, scratch| {2109masm.asm2110.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());2111masm.asm2112.xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());2113});2114}2115V128ExtendKind::HighI32x4S => {2116// Move the 3rd element (i.e., 0b10) to the 1st (rightmost)2117// position and the 4th element (i.e., 0b11) to the 2nd (second2118// from the right) position and then perform the extend.2119self.asm2120.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());2121self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());2122}2123V128ExtendKind::HighI32x4U => {2124self.with_scratch::<FloatScratch, _>(|masm, scratch| {2125// Set `scratch` to a vector 0s.2126masm.asm.xmm_vxorp_rrr(2127scratch.inner(),2128scratch.inner(),2129scratch.writable(),2130kind.src_lane_size(),2131);2132// Interleave the 0 bits into the two 32-bit integers to zero extend them.2133masm.asm2134.xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());2135});2136}2137}2138Ok(())2139}21402141fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {2142self.ensure_has_avx()?;2143match kind {2144V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),2145V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),2146V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),2147V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),2148V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),2149V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),2150V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),2151V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),2152V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),2153V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),2154};2155Ok(())2156}21572158fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {2159self.ensure_has_avx()?;2160match kind {2161V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),2162V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),2163V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),2164V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),2165V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),2166V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),2167V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),2168V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),2169V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),2170V128SubKind::I64x2 => 
                self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
        };
        Ok(())
    }

    fn v128_mul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128MulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        let rhs = context.pop_to_reg(self, None)?;
        let lhs = context.pop_to_reg(self, None)?;

        let mul_i64x2_avx512 = |this: &mut Self| {
            this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
        };

        let mul_i64x2_fallback = |this: &mut Self,
                                  context: &mut CodeGenContext<Emission>|
         -> Result<()> {
            // Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we
            // have to fall back to an instruction sequence using 32-bit multiplications (taken
            // from the Cranelift implementation, in `isa/x64/lower.isle`):
            //
            // > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
            // > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
            // > multiplication can then be written as:
            //
            // >      Ah Al
            // >    * Bh Bl
            // >      -----
            // >      Al * Bl
            // >    + (Ah * Bl) << 32
            // >    + (Al * Bh) << 32
            //
            // > So for each lane we will compute:
            //
            // > A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
            //
            // > Note, the algorithm will use `pmuludq` which operates directly on the lower
            // > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
            // > the lane of the destination. For this reason we don't need shifts to isolate
            // > the lower 32-bits, however, we will need to use shifts to isolate the high
            // > 32-bits when doing calculations, i.e., `Ah == A >> 32`.

            let tmp2 = context.any_fpr(this)?;
            this.with_scratch::<FloatScratch, _>(|this, tmp1| {
                // tmp1 = lhs_hi = (lhs >> 32)
                this.asm
                    .xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_hi * rhs_low = tmp1 * rhs
                this.asm
                    .xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));

                // tmp1 = rhs_hi = rhs >> 32
                this.asm
                    .xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);

                // tmp1 = lhs_low * rhs_high = tmp1 * lhs
                this.asm
                    .xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());

                // tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);

                // tmp1 = tmp1 << 32
                this.asm
                    .xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);

                // tmp2 = lhs_lo * rhs_lo
                this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));

                // Finally, with `lhs` as destination:
                // lhs = (lhs_low * rhs_low) + ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
                this.asm
                    .xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
            });

            context.free_reg(tmp2);

            Ok(())
        };

        match kind {
            V128MulKind::F32x4 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            V128MulKind::F64x2 => {
                self.asm
                    .xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
            }
            V128MulKind::I16x8 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
            }
            V128MulKind::I32x4 => {
                self.asm
                    .xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
            }
            // This is the fast path when AVX512 is available.
            V128MulKind::I64x2
                if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
            {
                mul_i64x2_avx512(self)
            }
            //
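            // As an illustrative worked example of the long-hand formula used by the fallback
            // below (values chosen here for the example, not taken from the lowering): for a
            // lane A = 0x0000_0001_0000_0002 (Ah = 1, Al = 2) and B = 3 (Bh = 0, Bl = 3),
            // Al * Bl = 6 and ((Ah * Bl) + (Al * Bh)) << 32 = 3 << 32, so the sum is
            // 0x0000_0003_0000_0006, which is exactly A * B (4_294_967_298 * 3 = 12_884_901_894).
            //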
Otherwise, we emit AVX fallback sequence.2281V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,2282}22832284context.stack.push(lhs.into());2285context.free_reg(rhs);22862287Ok(())2288}22892290fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {2291self.ensure_has_avx()?;22922293match kind {2294V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {2295self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())2296}2297V128AbsKind::I64x2 => {2298self.with_scratch::<FloatScratch, _>(|masm, scratch| {2299// Perform an arithmetic right shift of 31 bits. If the number2300// is positive, this will result in all zeroes in the upper2301// 32-bits. If the number is negative, this will result in all2302// ones in the upper 32-bits.2303masm.asm2304.xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);2305// Copy the ones and zeroes in the high bits of each 64-bit2306// lane to the low bits of each 64-bit lane.2307masm.asm.xmm_vpshuf_rr(2308scratch.inner(),2309scratch.writable(),23100b11_11_01_01,2311OperandSize::S32,2312);2313// Flip the bits in lanes that were negative in `src` and leave2314// the positive lanes as they are. Positive lanes will have a2315// zero mask in `scratch` so xor doesn't affect them.2316masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);2317// Subtract the mask from the results of xor which will2318// complete the two's complement for lanes which were negative.2319masm.asm2320.xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2321});2322}2323V128AbsKind::F32x4 | V128AbsKind::F64x2 => {2324self.with_scratch::<FloatScratch, _>(|masm, scratch| {2325// Create a mask of all ones.2326masm.asm.xmm_vpcmpeq_rrr(2327scratch.writable(),2328scratch.inner(),2329scratch.inner(),2330kind.lane_size(),2331);2332// Right shift the mask so each lane is a single zero followed2333// by all ones.2334masm.asm.xmm_vpsrl_rri(2335scratch.inner(),2336scratch.writable(),23370x1,2338kind.lane_size(),2339);2340// Use the mask to zero the sign bit in each lane which will2341// make the float value positive.2342masm.asm2343.xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());2344});2345}2346}2347Ok(())2348}23492350fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {2351self.ensure_has_avx()?;23522353match kind {2354V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {2355self.with_scratch::<FloatScratch, _>(|masm, tmp| {2356masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;2357masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;2358anyhow::Ok(())2359})?;2360}2361V128NegKind::F32x4 | V128NegKind::F64x2 => {2362self.with_scratch::<FloatScratch, _>(|masm, tmp| {2363// Create a mask of all 1s.2364masm.asm.xmm_vpcmpeq_rrr(2365tmp.writable(),2366tmp.inner(),2367tmp.inner(),2368kind.lane_size(),2369);2370// Left shift the lanes in the mask so only the sign bit in the2371// mask is set to 1.2372masm.asm.xmm_vpsll_rri(2373tmp.inner(),2374tmp.writable(),2375(kind.lane_size().num_bits() - 1) as u32,2376kind.lane_size(),2377);2378// Use the mask to flip the sign bit.2379masm.asm2380.xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());2381});2382}2383}2384Ok(())2385}23862387fn v128_shift(2388&mut self,2389context: &mut CodeGenContext<Emission>,2390lane_width: OperandSize,2391kind: ShiftKind,2392) -> Result<()> {2393self.ensure_has_avx()?;2394let shift_amount = context.pop_to_reg(self, None)?.reg;2395let operand = context.pop_to_reg(self, None)?.reg;2396let amount_mask = 
lane_width.num_bits() - 1;23972398self.and(2399writable!(shift_amount),2400shift_amount,2401RegImm::i32(amount_mask as i32),2402OperandSize::S32,2403)?;24042405self.with_scratch::<IntScratch, _>(|masm, tmp| {2406masm.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {2407let move_to_tmp_xmm = |this: &mut Self| {2408this.asm2409.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);2410};24112412// A helper for deciding between `vpsllw` and `vpsrlw` in2413// `shift_i8x16`.2414enum Direction {2415Left,2416Right,2417}24182419let shift_i8x16 = |this: &mut Self, masks: &'static [u8], direction: Direction| {2420// The case for i8x16 is a little bit trickier because x64 doesn't provide a 8bit2421// shift instruction. Instead, we shift as 16bits, and then mask the bits in the2422// 8bits lane, for example (with 2 8bits lanes):2423// - Before shifting:2424// 01001101 111011102425// - shifting by 2 left:2426// 00110111 101110002427// ^^_ these bits come from the previous byte, and need to be masked.2428// - The mask:2429// 11111100 111111112430// - After masking:2431// 00110100 101110002432//2433// The mask is loaded from a well known memory, depending on the shift amount.24342435this.asm2436.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);24372438// Perform the 16-bit shift.2439match direction {2440Direction::Left => this.asm.xmm_vpsll_rrr(2441operand,2442tmp_xmm.inner(),2443writable!(operand),2444OperandSize::S16,2445),2446Direction::Right => this.asm.xmm_vpsrl_rrr(2447operand,2448tmp_xmm.inner(),2449writable!(operand),2450OperandSize::S16,2451),2452}24532454// Get a handle to the masks array constant.2455let masks_addr = this.asm.add_constant(masks);24562457// Load the masks array effective address into the tmp register.2458this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);24592460// Compute the offset of the mask that we need to use. This is shift_amount * 16 ==2461// shift_amount << 4.2462this.asm2463.shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);24642465// Load the mask to tmp_xmm.2466this.asm.xmm_vmovdqu_mr(2467&Address::ImmRegRegShift {2468simm32: 0,2469base: tmp.inner(),2470index: shift_amount,2471shift: 0,2472},2473tmp_xmm.writable(),2474MemFlags::trusted(),2475);24762477// Mask unwanted bits from operand.2478this.asm2479.xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));2480};24812482let i64x2_shr_s = |this: &mut Self,2483context: &mut CodeGenContext<Emission>|2484-> Result<()> {2485const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;24862487// AVX doesn't have an instruction for i64x2 signed right shift. 
Instead we use the2488// following formula (from hacker's delight 2-7), where x is the value and n the shift2489// amount, for each lane:2490// t = (1 << 63) >> n; ((x >> n) ^ t) - t24912492// We need an extra scratch register:2493let tmp_xmm2 = context.any_fpr(this)?;24942495this.asm2496.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);24972498let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());24992500this.asm2501.xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());2502this.asm.xmm_vpsrl_rrr(2503tmp_xmm2,2504tmp_xmm.inner(),2505writable!(tmp_xmm2),2506OperandSize::S64,2507);2508this.asm.xmm_vpsrl_rrr(2509operand,2510tmp_xmm.inner(),2511writable!(operand),2512OperandSize::S64,2513);2514this.asm2515.xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));2516this.asm2517.xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);25182519context.free_reg(tmp_xmm2);25202521Ok(())2522};25232524let i8x16_shr_s = |this: &mut Self,2525context: &mut CodeGenContext<Emission>|2526-> Result<()> {2527// Since the x86 instruction set does not have an 8x16 shift instruction and the2528// approach used for `ishl` and `ushr` cannot be easily used (the masks do not2529// preserve the sign), we use a different approach here: separate the low and2530// high lanes, shift them separately, and merge them into the final result.2531//2532// Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,2533// s15]:2534//2535// lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]2536// shifted_lo.i16x8 = shift each lane of `low`2537// hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]2538// shifted_hi.i16x8 = shift each lane of `high`2539// result = [s0'', s1'', ..., s15'']25402541// In order for `packsswb` later to only use the high byte of each2542// 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to2543// fill in the upper bits appropriately.2544this.asm2545.add_ir(8, writable!(shift_amount), OperandSize::S32);2546this.asm2547.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);25482549let tmp_lo = context.any_fpr(this)?;2550let tmp_hi = context.any_fpr(this)?;25512552// Extract lower and upper bytes.2553this.asm2554.xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);2555this.asm2556.xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);25572558// Perform 16bit right shift of upper and lower bytes.2559this.asm.xmm_vpsra_rrr(2560tmp_lo,2561tmp_xmm.inner(),2562writable!(tmp_lo),2563OperandSize::S16,2564);2565this.asm.xmm_vpsra_rrr(2566tmp_hi,2567tmp_xmm.inner(),2568writable!(tmp_hi),2569OperandSize::S16,2570);25712572// Merge lower and upper bytes back.2573this.asm2574.xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);25752576context.free_reg(tmp_lo);2577context.free_reg(tmp_hi);25782579Ok(())2580};25812582match (lane_width, kind) {2583// shl2584(OperandSize::S8, ShiftKind::Shl) => {2585shift_i8x16(masm, &I8X16_ISHL_MASKS, Direction::Left)2586}2587(OperandSize::S16, ShiftKind::Shl) => {2588move_to_tmp_xmm(masm);2589masm.asm.xmm_vpsll_rrr(2590operand,2591tmp_xmm.inner(),2592writable!(operand),2593OperandSize::S16,2594);2595}2596(OperandSize::S32, ShiftKind::Shl) => {2597move_to_tmp_xmm(masm);2598masm.asm.xmm_vpsll_rrr(2599operand,2600tmp_xmm.inner(),2601writable!(operand),2602OperandSize::S32,2603);2604}2605(OperandSize::S64, ShiftKind::Shl) => 
{2606move_to_tmp_xmm(masm);2607masm.asm.xmm_vpsll_rrr(2608operand,2609tmp_xmm.inner(),2610writable!(operand),2611OperandSize::S64,2612);2613}2614// shr_u2615(OperandSize::S8, ShiftKind::ShrU) => {2616shift_i8x16(masm, &I8X16_USHR_MASKS, Direction::Right)2617}2618(OperandSize::S16, ShiftKind::ShrU) => {2619move_to_tmp_xmm(masm);2620masm.asm.xmm_vpsrl_rrr(2621operand,2622tmp_xmm.inner(),2623writable!(operand),2624OperandSize::S16,2625);2626}2627(OperandSize::S32, ShiftKind::ShrU) => {2628move_to_tmp_xmm(masm);2629masm.asm.xmm_vpsrl_rrr(2630operand,2631tmp_xmm.inner(),2632writable!(operand),2633OperandSize::S32,2634);2635}2636(OperandSize::S64, ShiftKind::ShrU) => {2637move_to_tmp_xmm(masm);2638masm.asm.xmm_vpsrl_rrr(2639operand,2640tmp_xmm.inner(),2641writable!(operand),2642OperandSize::S64,2643);2644}2645// shr_s2646(OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(masm, context)?,2647(OperandSize::S16, ShiftKind::ShrS) => {2648move_to_tmp_xmm(masm);2649masm.asm.xmm_vpsra_rrr(2650operand,2651tmp_xmm.inner(),2652writable!(operand),2653OperandSize::S16,2654);2655}2656(OperandSize::S32, ShiftKind::ShrS) => {2657move_to_tmp_xmm(masm);2658masm.asm.xmm_vpsra_rrr(2659operand,2660tmp_xmm.inner(),2661writable!(operand),2662OperandSize::S32,2663);2664}2665(OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(masm, context)?,26662667_ => bail!(CodeGenError::invalid_operand_combination()),2668}26692670Ok(())2671})2672})?;26732674context.free_reg(shift_amount);2675context2676.stack2677.push(TypedReg::new(WasmValType::V128, operand).into());2678Ok(())2679}26802681fn v128_q15mulr_sat_s(2682&mut self,2683lhs: Reg,2684rhs: Reg,2685dst: WritableReg,2686size: OperandSize,2687) -> Result<()> {2688self.ensure_has_avx()?;26892690self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);26912692// Need to handle edge case of multiplying -1 by -1 (0x8000 in Q152693// format) because of how `vpmulhrs` handles rounding. 
`vpmulhrs`2694// produces 0x8000 in that case when the correct result is 0x7FFF (that2695// is, +1) so need to check if the result is 0x8000 and flip the bits2696// of the result if it is.2697let address = self.asm.add_constant(&[26980x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,26990x00, 0x80,2700]);2701self.asm2702.xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);2703self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);2704Ok(())2705}27062707fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2708self.ensure_has_avx()?;27092710self.with_scratch::<FloatScratch, _>(|masm, scratch| {2711// Create a mask of all 0s.2712masm.asm2713.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());2714// Sets lane in `dst` to not zero if `src` lane was zero, and lane in2715// `dst` to zero if `src` lane was not zero.2716masm.asm2717.xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);2718// Sets ZF if all values are zero (i.e., if all original values were not zero).2719masm.asm.xmm_vptest(src, src);2720// Set byte if ZF=1.2721});2722self.asm.setcc(IntCmpKind::Eq, dst);2723Ok(())2724}27252726fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {2727self.ensure_has_avx()?;27282729match size {2730OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),2731OperandSize::S16 => {2732// Signed conversion of 16-bit integers to 8-bit integers.2733self.asm2734.xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);2735// Creates a mask from each byte in `src`.2736self.asm2737.xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);2738// Removes 8 bits added as a result of the `vpackss` step.2739self.asm2740.shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);2741}2742OperandSize::S32 | OperandSize::S64 => {2743self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)2744}2745_ => unimplemented!(),2746}27472748Ok(())2749}27502751fn v128_trunc(2752&mut self,2753context: &mut CodeGenContext<Emission>,2754kind: V128TruncKind,2755) -> Result<()> {2756self.ensure_has_avx()?;27572758let reg = writable!(context.pop_to_reg(self, None)?.reg);2759match kind {2760V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(2761reg.to_reg(),2762reg,2763VroundMode::TowardZero,2764kind.dst_lane_size(),2765),2766V128TruncKind::I32x4FromF32x4S => {2767self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;2768}2769V128TruncKind::I32x4FromF32x4U => {2770let temp_reg = writable!(context.any_fpr(self)?);2771self.v128_trunc_sat_f32x4_u(2772reg,2773temp_reg,2774kind.src_lane_size(),2775kind.dst_lane_size(),2776)?;2777context.free_reg(temp_reg.to_reg());2778}2779V128TruncKind::I32x4FromF64x2SZero => {2780self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;2781}2782V128TruncKind::I32x4FromF64x2UZero => {2783self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;2784}2785}27862787context.stack.push(TypedReg::v128(reg.to_reg()).into());2788Ok(())2789}27902791fn v128_min(2792&mut self,2793src1: Reg,2794src2: Reg,2795dst: WritableReg,2796kind: V128MinKind,2797) -> Result<()> {2798self.ensure_has_avx()?;27992800match kind {2801V128MinKind::I8x16S2802| V128MinKind::I8x16U2803| V128MinKind::I16x8S2804| V128MinKind::I16x8U2805| V128MinKind::I32x4S2806| V128MinKind::I32x4U => {2807match kind {2808V128MinKind::I8x16S => {2809self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)2810}2811V128MinKind::I8x16U 
=> {2812self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)2813}2814V128MinKind::I16x8S => {2815self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)2816}2817V128MinKind::I16x8U => {2818self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)2819}2820V128MinKind::I32x4S => {2821self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)2822}2823V128MinKind::I32x4U => {2824self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)2825}2826_ => unreachable!(),2827};2828}2829V128MinKind::F32x4 | V128MinKind::F64x2 => {2830self.with_scratch::<FloatScratch, _>(|masm, scratch| {2831// Handling +0 and -0 as well as NaN values are not commutative2832// when using `vminp` so we have to compensate.2833// Perform two comparison operations with the operands swapped2834// and OR the result to propagate 0 (positive and negative) and2835// NaN.2836masm.asm2837.xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());2838masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());2839// Use a single OR instruction to set the sign bit if either2840// result has the sign bit set to correctly propagate -0.2841masm.asm2842.xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2843});2844// Set lanes with NaN to all 1s.2845self.asm.xmm_vcmpp_rrr(2846writable!(src2),2847src2,2848dst.to_reg(),2849kind.lane_size(),2850VcmpKind::Unord,2851);2852// Doesn't change non-NaN values. For NaN values, sets all bits.2853self.asm2854.xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());2855self.canonicalize_nans(writable!(src2), dst, kind.lane_size());2856}2857}28582859Ok(())2860}28612862fn v128_max(2863&mut self,2864src1: Reg,2865src2: Reg,2866dst: WritableReg,2867kind: V128MaxKind,2868) -> Result<()> {2869self.ensure_has_avx()?;28702871match kind {2872V128MaxKind::I8x16S2873| V128MaxKind::I8x16U2874| V128MaxKind::I16x8S2875| V128MaxKind::I16x8U2876| V128MaxKind::I32x4S2877| V128MaxKind::I32x4U => {2878match kind {2879V128MaxKind::I8x16S => {2880self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)2881}2882V128MaxKind::I8x16U => {2883self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)2884}2885V128MaxKind::I16x8S => {2886self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)2887}2888V128MaxKind::I16x8U => {2889self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)2890}2891V128MaxKind::I32x4S => {2892self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)2893}2894V128MaxKind::I32x4U => {2895self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)2896}2897_ => unreachable!(),2898};2899}2900V128MaxKind::F32x4 | V128MaxKind::F64x2 => {2901self.with_scratch::<FloatScratch, _>(|masm, scratch| {2902// Handling +0 and -0 as well as NaN values are not commutative2903// when using `vmaxp` so we have to compensate.2904// Perform two comparison operations with the operands swapped2905// so we can propagate 0 (positive and negative) and NaNs2906// correctly.29072908masm.asm2909.xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());2910masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());2911// This combination of XOR, OR, and SUB will set the sign bit2912// on a 0 result to the correct value for a max operation.2913masm.asm2914.xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());2915masm.asm.xmm_vorp_rrr(2916dst.to_reg(),2917scratch.inner(),2918writable!(src2),2919kind.lane_size(),2920);2921});2922self.asm2923.xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());2924// Set lanes of NaN values to 
                // all 1s.
                self.asm.xmm_vcmpp_rrr(
                    writable!(src2),
                    src2,
                    src2,
                    kind.lane_size(),
                    VcmpKind::Unord,
                );
                self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
            }
        }
        Ok(())
    }

    fn v128_extmul(
        &mut self,
        context: &mut CodeGenContext<Emission>,
        kind: V128ExtMulKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        // The implementation for extmul is not optimized; for simplicity's sake, we simply perform
        // an extension followed by a multiplication using already implemented primitives.

        let src1 = context.pop_to_reg(self, None)?;
        let src2 = context.pop_to_reg(self, None)?;

        let ext_kind = kind.into();
        self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
        self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;

        context.stack.push(src2.into());
        context.stack.push(src1.into());

        self.v128_mul(context, kind.into())
    }

    fn v128_extadd_pairwise(
        &mut self,
        src: Reg,
        dst: WritableReg,
        kind: V128ExtAddKind,
    ) -> Result<()> {
        self.ensure_has_avx()?;

        match kind {
            V128ExtAddKind::I8x16S => {
                self.with_scratch::<FloatScratch, _>(|masm, scratch| {
                    // Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
                    // sign extend `src` to 16 bits and add adjacent words.
                    // Need to supply constant as first operand since first operand
                    // is treated as unsigned and the second operand is signed.
                    let mask = masm.asm.add_constant(&[1; 16]);
                    masm.asm.xmm_mov_mr(
                        &mask,
                        scratch.writable(),
                        OperandSize::S128,
                        MemFlags::trusted(),
                    );
                    masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);
                });
            }
            V128ExtAddKind::I8x16U => {
                // Same approach as the signed variant but treat `src` as
                // unsigned instead of signed by passing it as the first
                // operand.
                let mask = self.asm.add_constant(&[1; 16]);
                self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);
            }
            V128ExtAddKind::I16x8S => {
                // Similar approach to the two variants above. The vector is 8
                // lanes of 16-bit 1's and `vpmaddwd` treats both operands as
                // signed.
                let mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
            }
            V128ExtAddKind::I16x8U => {
                // Similar approach to the signed variant.
                // `vpmaddwd` operates on signed integers and the operand is
                // unsigned so the operand needs to be converted to a signed
                // format and then that process needs to be reversed after
                // `vpmaddwd`.
                // Flip the sign bit for 8 16-bit lanes.
                let xor_mask = self.asm.add_constant(&[
                    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
                    0x80, 0x00, 0x80,
                ]);
                self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);

                let madd_mask = self
                    .asm
                    .add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
                self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);

                // Reverse the XOR.
The XOR effectively subtracts 32,768 from3021// both pairs that are added together so 65,536 (0x10000)3022// needs to be added to 4 lanes of 32-bit values.3023let add_mask = self3024.asm3025.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);3026self.asm3027.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);3028}3029}3030Ok(())3031}30323033fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {3034self.ensure_has_avx()?;3035self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);3036Ok(())3037}30383039fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {3040self.ensure_has_avx()?;30413042let reg = writable!(context.pop_to_reg(self, None)?.reg);30433044// This works by using a lookup table to determine the count of bits3045// set in the upper 4 bits and lower 4 bits separately and then adding3046// the counts.30473048// A mask to zero out the upper 4 bits in each lane.3049let address = self.asm.add_constant(&[30500x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,30510x0F, 0x0F,3052]);30533054self.with_scratch::<FloatScratch, _>(|masm, scratch| {3055// Zero out the upper 4 bits of each lane.3056masm.asm3057.xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());3058// Right shift bytes in input by 4 bits to put the upper 4 bits in the3059// lower 4 bits.3060masm.asm3061.xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);3062// Zero out the upper 4 bits of each shifted lane.3063masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);30643065// Write a lookup table of 4 bit values to number of bits set to a3066// register so we only perform the memory read once.3067// Index (hex) | Value (binary) | Population Count3068// 0x0 | 0000 | 03069// 0x1 | 0001 | 13070// 0x2 | 0010 | 13071// 0x3 | 0011 | 23072// 0x4 | 0100 | 13073// 0x5 | 0101 | 23074// 0x6 | 0110 | 23075// 0x7 | 0111 | 33076// 0x8 | 1000 | 13077// 0x9 | 1001 | 23078// 0xA | 1010 | 23079// 0xB | 1011 | 33080// 0xC | 1100 | 23081// 0xD | 1101 | 33082// 0xE | 1110 | 33083// 0xF | 1111 | 43084let address = masm.asm.add_constant(&[30850x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,3086]);3087let reg2 = writable!(context.any_fpr(masm)?);3088masm.asm3089.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());3090// Use the upper 4 bits as an index into the lookup table.3091masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());3092// Use the lower 4 bits as an index into the lookup table.3093masm.asm3094.xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());3095context.free_reg(reg2.to_reg());30963097// Add the counts of the upper 4 bits and the lower 4 bits to get the3098// total number of bits set.3099masm.asm3100.xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);3101anyhow::Ok(())3102})?;31033104context.stack.push(TypedReg::v128(reg.to_reg()).into());3105Ok(())3106}31073108fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3109self.ensure_has_avx()?;3110self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);3111Ok(())3112}31133114fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3115self.ensure_has_avx()?;3116self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);3117Ok(())3118}31193120fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3121self.ensure_has_avx()?;3122self.asm.xmm_vsqrtp_rr(src, dst, size);3123Ok(())3124}31253126fn v128_ceil(&mut self, src: Reg, dst: 
WritableReg, size: OperandSize) -> Result<()> {3127self.ensure_has_avx()?;3128self.asm3129.xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);3130Ok(())3131}31323133fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3134self.ensure_has_avx()?;3135self.asm3136.xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);3137Ok(())3138}31393140fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3141self.ensure_has_avx()?;3142self.asm3143.xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);3144Ok(())3145}31463147fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3148self.ensure_has_avx()?;3149// Reverse operands since Wasm specifies returning the first operand if3150// either operand is NaN while x86 returns the second operand.3151self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);3152Ok(())3153}31543155fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {3156self.ensure_has_avx()?;3157// Reverse operands since Wasm specifies returning the first operand if3158// either operand is NaN while x86 returns the second operand.3159self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);3160Ok(())3161}3162}31633164impl MacroAssembler {3165/// Create an x64 MacroAssembler.3166pub fn new(3167ptr_size: impl PtrSize,3168shared_flags: settings::Flags,3169isa_flags: x64_settings::Flags,3170) -> Result<Self> {3171let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());31723173Ok(Self {3174sp_offset: 0,3175sp_max: 0,3176stack_max_use_add: None,3177asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),3178flags: isa_flags,3179shared_flags,3180ptr_size: ptr_type.try_into()?,3181scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),3182})3183}31843185/// Add the maximum stack used to a register, recording an obligation to update the3186/// add-with-immediate instruction emitted to use the real stack max when the masm is being3187/// finalized.3188fn add_stack_max(&mut self, reg: Reg) {3189assert!(self.stack_max_use_add.is_none());3190let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);3191self.stack_max_use_add.replace(patch);3192}31933194fn ensure_has_avx(&self) -> Result<()> {3195anyhow::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);3196Ok(())3197}31983199fn ensure_has_avx2(&self) -> Result<()> {3200anyhow::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);3201Ok(())3202}32033204fn ensure_has_avx512vl(&self) -> Result<()> {3205anyhow::ensure!(3206self.flags.has_avx512vl(),3207CodeGenError::UnimplementedForNoAvx512VL3208);3209Ok(())3210}32113212fn ensure_has_avx512dq(&self) -> Result<()> {3213anyhow::ensure!(3214self.flags.has_avx512dq(),3215CodeGenError::UnimplementedForNoAvx512DQ3216);3217Ok(())3218}32193220fn increment_sp(&mut self, bytes: u32) {3221self.sp_offset += bytes;32223223// NOTE: we use `max` here to track the largest stack allocation in `sp_max`. 
Once we have3224// seen the entire function, this value will represent the maximum size for the stack3225// frame.3226self.sp_max = self.sp_max.max(self.sp_offset);3227}32283229fn decrement_sp(&mut self, bytes: u32) {3230assert!(3231self.sp_offset >= bytes,3232"sp offset = {}; bytes = {}",3233self.sp_offset,3234bytes3235);3236self.sp_offset -= bytes;3237}32383239fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {3240match constant {3241I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),3242I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),3243I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3244I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3245I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),3246}3247}32483249/// A common implementation for zero-extend stack loads.3250fn load_impl(3251&mut self,3252src: Address,3253dst: WritableReg,3254size: OperandSize,3255flags: MemFlags,3256) -> Result<()> {3257if dst.to_reg().is_int() {3258let ext = size.extend_to::<Zero>(OperandSize::S64);3259self.asm.movzx_mr(&src, dst, ext, flags);3260} else {3261self.asm.xmm_mov_mr(&src, dst, size, flags);3262}32633264Ok(())3265}32663267/// A common implementation for stack stores.3268fn store_impl(3269&mut self,3270src: RegImm,3271dst: Address,3272size: OperandSize,3273flags: MemFlags,3274) -> Result<()> {3275let _ = match src {3276RegImm::Imm(imm) => match imm {3277I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),3278I::I64(v) => match v.try_into() {3279Ok(v) => self.asm.mov_im(v, &dst, size, flags),3280Err(_) => {3281// If the immediate doesn't sign extend, use a scratch3282// register.3283self.with_scratch::<IntScratch, _>(|masm, scratch| {3284masm.asm.mov_ir(v, scratch.writable(), size);3285masm.asm.mov_rm(scratch.inner(), &dst, size, flags);3286});3287}3288},3289I::F32(v) => {3290let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3291self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3292// Always trusted, since we are loading the constant from3293// the constant pool.3294masm.asm.xmm_mov_mr(3295&addr,3296float_scratch.writable(),3297size,3298MemFlags::trusted(),3299);3300masm.asm3301.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3302});3303}3304I::F64(v) => {3305let addr = self.asm.add_constant(v.to_le_bytes().as_slice());33063307self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {3308// Similar to above, always trusted since we are loading the3309// constant from the constant pool.3310masm.asm.xmm_mov_mr(3311&addr,3312float_scratch.writable(),3313size,3314MemFlags::trusted(),3315);3316masm.asm3317.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);3318});3319}3320I::V128(v) => {3321let addr = self.asm.add_constant(v.to_le_bytes().as_slice());3322self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {3323// Always trusted, since we are loading the constant from3324// the constant pool.3325masm.asm.xmm_mov_mr(3326&addr,3327vector_scratch.writable(),3328size,3329MemFlags::trusted(),3330);3331masm.asm3332.xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);3333});3334}3335},3336RegImm::Reg(reg) => {3337if reg.is_int() {3338self.asm.mov_rm(reg, &dst, size, flags);3339} else {3340self.asm.xmm_mov_rm(reg, &dst, size, flags);3341}3342}3343};3344Ok(())3345}33463347fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {3348if dst != lhs {3349Err(anyhow!(CodeGenError::invalid_two_arg_form()))3350} else {3351Ok(())3352}3353}33543355/// 
    /// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
    fn vpshuf_mask_for_64_bit_splats() -> u8 {
        // Results in the first 4 bytes and second 4 bytes being
        // swapped and then the swapped bytes being copied.
        // [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
        // [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
        0b01_00_01_00
    }

    fn v128_trunc_sat_f32x4_s(
        &mut self,
        reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Create a mask to handle NaN values (1 for not NaN, 0 for
            // NaN).
            masm.asm.xmm_vcmpp_rrr(
                scratch.writable(),
                reg.to_reg(),
                reg.to_reg(),
                src_lane_size,
                VcmpKind::Eq,
            );
            // Zero out any NaN values.
            masm.asm
                .xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a mask for the sign bits.
            masm.asm
                .xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
            // Convert floats to integers.
            masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
            // Apply sign mask to the converted integers.
            masm.asm
                .xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
            // Create a saturation mask of all 1s for negative numbers,
            // all 0s for positive numbers. The arithmetic shift will copy
            // the sign bit.
            masm.asm
                .xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
            // Combine converted integers with saturation mask.
            masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
            Ok(())
        })
    }

    fn v128_trunc_sat_f32x4_u(
        &mut self,
        reg: WritableReg,
        temp_reg: WritableReg,
        src_lane_size: OperandSize,
        dst_lane_size: OperandSize,
    ) -> Result<()> {
        self.with_scratch::<FloatScratch, _>(|masm, scratch| {
            // Set scratch to all zeros.
            masm.asm.xmm_vxorp_rrr(
                reg.to_reg(),
                reg.to_reg(),
                scratch.writable(),
                src_lane_size,
            );
            // Clamp negative numbers to 0.
            masm.asm
                .xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
            // Create a vector of all 1s.
            masm.asm.xmm_vpcmpeq_rrr(
                scratch.writable(),
                scratch.inner(),
                scratch.inner(),
                src_lane_size,
            );
            // Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
            // performing a logical shift right.
            masm.asm
                .xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
            // Convert max signed int to float as a reference point for saturation.
            masm.asm
                .xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
            // Convert the floats to integers and put the results in `temp_reg`.
            // This is signed and not unsigned so we need to handle the
            // value for the high bit in each lane.
            masm.asm
                .xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
            // Set `reg` lanes to the amount that the value in the lane
            // exceeds the maximum signed 32-bit integer.
            masm.asm
                .xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
            // Create mask in `scratch` for numbers that are larger than
            // the maximum signed 32-bit integer.
Lanes that don't fit3445// in 32-bits ints will be 1.3446masm.asm.xmm_vcmpp_rrr(3447scratch.writable(),3448scratch.inner(),3449reg.to_reg(),3450dst_lane_size,3451VcmpKind::Le,3452);3453// Convert the excess over signed 32-bits from floats to integers.3454masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);3455// Apply large number mask to excess values which will flip the3456// bits in any lanes that exceed signed 32-bits. Adding this3457// flipped value to the signed value will set the high bit and3458// the carry behavior will update the other bits correctly.3459masm.asm3460.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());3461// Set `reg` to all 0s.3462masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);3463// Ensure excess values are not negative by taking max b/w3464// excess values and zero.3465masm.asm3466.xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);3467});3468// Perform the addition between the signed conversion value (in3469// `reg2`) and the flipped excess value (in `reg`) to get the3470// unsigned value.3471self.asm3472.xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);3473Ok(())3474}34753476fn v128_trunc_sat_f64x2_s_zero(3477&mut self,3478reg: WritableReg,3479src_lane_size: OperandSize,3480) -> Result<()> {3481self.with_scratch::<FloatScratch, _>(|masm, scratch| {3482// Create a NaN mask (1s for non-NaN, 0s for NaN).3483masm.asm.xmm_vcmpp_rrr(3484scratch.writable(),3485reg.to_reg(),3486reg.to_reg(),3487src_lane_size,3488VcmpKind::Eq,3489);3490// Clamp NaN values to maximum 64-bit float that can be3491// converted to an i32.3492let address = masm.asm.add_constant(&[34930x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,34940xDF, 0x41,3495]);3496masm.asm3497.xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);3498// Handle the saturation for values too large to fit in an i32.3499masm.asm3500.xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3501// Convert the floats to integers.3502masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);35033504Ok(())3505})3506}35073508fn v128_trunc_sat_f64x2_u_zero(3509&mut self,3510reg: WritableReg,3511src_lane_size: OperandSize,3512dst_lane_size: OperandSize,3513) -> Result<()> {3514self.with_scratch::<FloatScratch, _>(|masm, scratch| {3515// Zero out the scratch register.3516masm.asm.xmm_vxorp_rrr(3517scratch.inner(),3518scratch.inner(),3519scratch.writable(),3520src_lane_size,3521);3522// Clamp negative values to zero.3523masm.asm3524.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);3525// Clamp value to maximum unsigned 32-bit integer value3526// (0x41F0000000000000).3527let address = masm.asm.add_constant(&[35280x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,35290xEF, 0x41,3530]);3531masm.asm3532.xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);3533// Truncate floating point values.3534masm.asm3535.xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);3536// Add 2^52 (doubles store 52 bits in their mantissa) to each3537// lane causing values in the lower bits to be shifted into3538// position for integer conversion.3539let address = masm.asm.add_constant(&[35400x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,35410x30, 0x43,3542]);3543masm.asm3544.xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);3545// Takes lanes 0 and 2 from `reg` (converted values) and lanes3546// 0 and 2 from 
            // `scratch` (zeroes) to put the converted ints in
            // the lower lanes and zeroes in the upper lanes.
            masm.asm.xmm_vshufp_rrri(
                reg.to_reg(),
                scratch.inner(),
                reg,
                0b10_00_10_00,
                dst_lane_size,
            );
            Ok(())
        })
    }

    /// Given a vector of floats where lanes with NaN values are set to all 1s
    /// in `mask` and a vector register `dst` with a mix of non-NaN values and
    /// possibly non-canonical NaN values, this canonicalizes any NaNs in `dst`.
    fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
        // Canonical NaNs do not preserve the sign bit, have the exponent bits
        // all set, and have only the high bit of the mantissa set, so shift by
        // that number.
        // The mask we're producing in this step will be inverted in the next
        // step.
        let amount_to_shift = 1 + size.mantissa_bits() + 1;
        self.asm
            .xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
        // The mask will be inverted by the ANDN so non-NaN values will be all
        // 1s and NaN values will set the sign bit, exponent bits, and zero out
        // almost all of the mantissa.
        self.asm
            .xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
    }
}
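
// The scalar checks below are illustrative sketches of the bit-level identities the vector
// lowerings above rely on: the Hacker's Delight shift formula used by the i64x2 `shr_s`
// fallback, the 0x1.0p52 mantissa trick used by the `I32x4LowU` conversion, the nibble
// popcount table used by `v128_popcnt`, and the Q15 rounding corner case patched by
// `v128_q15mulr_sat_s`. They assume only the standard library; the module and test names are
// illustrative and the tests are not exercised by the assembler itself.
#[cfg(test)]
mod lowering_identity_sketches {
    #[test]
    fn shr_s_formula_matches_arithmetic_shift() {
        // t = (1 << 63) >> n; ((x >> n) ^ t) - t, using only logical shifts.
        for &x in &[0i64, 1, -1, 42, -42, i64::MIN, i64::MAX] {
            for n in 0..64u32 {
                let t = (1u64 << 63) >> n;
                let logical = (x as u64) >> n;
                let got = (logical ^ t).wrapping_sub(t) as i64;
                assert_eq!(got, x >> n);
            }
        }
    }

    #[test]
    fn mantissa_trick_converts_u32_to_f64() {
        // Interleaving 0x43300000 with a u32 builds the double 0x1.0p52 + n;
        // subtracting 0x1.0p52 leaves the integer value exactly.
        let p52 = f64::from_bits(0x4330_0000u64 << 32);
        for &n in &[0u32, 1, 42, 0x8000_0000, u32::MAX] {
            let bits = (0x4330_0000u64 << 32) | u64::from(n);
            assert_eq!(f64::from_bits(bits) - p52, f64::from(n));
        }
    }

    #[test]
    fn nibble_table_counts_bits() {
        // Same table as the constant loaded in `v128_popcnt`.
        const LUT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
        for b in 0u8..=255 {
            let count = LUT[usize::from(b & 0x0F)] + LUT[usize::from(b >> 4)];
            assert_eq!(u32::from(count), b.count_ones());
        }
    }

    #[test]
    fn q15_rounding_multiply_wraps_at_minimum() {
        // `vpmulhrsw`-style rounding multiply: (a * b * 2 + 0x8000) >> 16.
        fn q15_mulr(a: i16, b: i16) -> i16 {
            ((i64::from(a) * i64::from(b) * 2 + 0x8000) >> 16) as i16
        }
        // -1.0 * -1.0 in Q15 wraps back to 0x8000 instead of saturating to
        // 0x7FFF, which is the lane `v128_q15mulr_sat_s` patches afterwards.
        assert_eq!(q15_mulr(i16::MIN, i16::MIN), i16::MIN);
        // Ordinary values round as expected: 0.5 * 0.5 == 0.25.
        assert_eq!(q15_mulr(0x4000, 0x4000), 0x2000);
    }
}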