GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/winch/codegen/src/isa/x64/masm.rs
1
use super::{
2
RegAlloc,
3
abi::X64ABI,
4
address::Address,
5
asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
6
regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},
7
};
8
use anyhow::{Result, anyhow, bail};
9
10
use crate::masm::{
11
DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm as I, IntCmpKind,
12
IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm,
13
RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, ShiftKind, SplatKind,
14
StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, V128AbsKind, V128AddKind,
15
V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind, V128MinKind,
16
V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, VectorCompareKind,
17
VectorEqualityKind, Zero,
18
};
19
use crate::{
20
abi::{self, LocalSlot, align_to, calculate_frame_adjustment},
21
codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},
22
stack::{TypedReg, Val},
23
};
24
use crate::{
25
abi::{ABI, vmctx},
26
masm::{SPOffset, StackSlot},
27
};
28
use crate::{
29
isa::{
30
CallingConvention,
31
reg::{Reg, RegClass, WritableReg, writable},
32
},
33
masm::CalleeKind,
34
};
35
use cranelift_codegen::{
36
Final, MachBufferFinalized, MachLabel,
37
binemit::CodeOffset,
38
ir::{MemFlags, RelSourceLoc, SourceLoc},
39
isa::{
40
unwind::UnwindInst,
41
x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},
42
},
43
settings,
44
};
45
use wasmtime_cranelift::TRAP_UNREACHABLE;
46
use wasmtime_environ::{PtrSize, WasmValType};
47
48
// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`
49
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
50
// need to fix up the bits that migrate from one half of the lane to the
51
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
52
// right by 0 (no movement), we want to retain all the bits so we mask with
53
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
54
// we mask with `0x7f`; etc.
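// E.g. shifting left by 3 is done as a 16x8 shift by 3 followed by an AND
// with the fourth mask row (all `0xf8`), which clears the low three bits of
// each byte, i.e. the bits that migrated up from the lower byte of the lane.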
55
56
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
57
const I8X16_ISHL_MASKS: [u8; 128] = [
58
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
59
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
60
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
61
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
62
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
63
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
64
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
65
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
66
];
67
68
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
69
const I8X16_USHR_MASKS: [u8; 128] = [
70
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
72
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
73
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
74
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
75
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
76
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
77
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
78
];
79
80
/// x64 MacroAssembler.
81
pub(crate) struct MacroAssembler {
82
/// Stack pointer offset.
83
sp_offset: u32,
84
/// This value represents the maximum stack size seen while compiling the function. While the
85
/// function is still being compiled its value will not be valid (the stack will grow and
86
/// shrink as space is reserved and freed during compilation), but once all instructions have
87
/// been seen this value will be the maximum stack usage seen.
88
sp_max: u32,
89
/// Add instructions that are used to add the constant stack max to a register.
90
stack_max_use_add: Option<PatchableAddToReg>,
91
/// Low level assembler.
92
asm: Assembler,
93
/// ISA flags.
94
flags: x64_settings::Flags,
95
    /// Shared flags.
96
shared_flags: settings::Flags,
97
/// The target pointer size.
98
ptr_size: OperandSize,
99
/// Scratch register scope.
100
scratch_scope: RegAlloc,
101
}
102
103
impl Masm for MacroAssembler {
104
type Address = Address;
105
type Ptr = u8;
106
type ABI = X64ABI;
107
108
fn frame_setup(&mut self) -> Result<()> {
109
let frame_pointer = rbp();
110
let stack_pointer = rsp();
111
112
self.asm.push_r(frame_pointer);
113
114
if self.shared_flags.unwind_info() {
115
self.asm.unwind_inst(UnwindInst::PushFrameRegs {
116
offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
117
})
118
}
119
120
self.asm
121
.mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);
122
123
Ok(())
124
}
125
126
fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
127
let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();
128
129
self.with_scratch::<IntScratch, _>(|masm, scratch| {
130
masm.load_ptr(
131
masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
132
scratch.writable(),
133
)?;
134
135
masm.load_ptr(
136
Address::offset(
137
scratch.inner(),
138
ptr_size.vmstore_context_stack_limit().into(),
139
),
140
scratch.writable(),
141
)?;
142
143
masm.add_stack_max(scratch.inner());
144
145
masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);
146
masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
147
anyhow::Ok(())
148
})?;
149
150
// Emit unwind info.
151
if self.shared_flags.unwind_info() {
152
self.asm.unwind_inst(UnwindInst::DefineNewFrame {
153
offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
154
155
// The Winch calling convention has no callee-save registers, so nothing will be
156
// clobbered.
157
offset_downward_to_clobbers: 0,
158
})
159
}
160
Ok(())
161
}
162
163
fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
164
let bytes = match (reg.class(), size) {
165
(RegClass::Int, OperandSize::S64) => {
166
let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
167
self.asm.push_r(reg);
168
self.increment_sp(word_bytes);
169
word_bytes
170
}
171
(RegClass::Int, OperandSize::S32) => {
172
let bytes = size.bytes();
173
self.reserve_stack(bytes)?;
174
let sp_offset = SPOffset::from_u32(self.sp_offset);
175
self.asm
176
.mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
177
bytes
178
}
179
(RegClass::Float, _) => {
180
let bytes = size.bytes();
181
self.reserve_stack(bytes)?;
182
let sp_offset = SPOffset::from_u32(self.sp_offset);
183
self.asm
184
.xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
185
bytes
186
}
187
_ => unreachable!(),
188
};
189
190
Ok(StackSlot {
191
offset: SPOffset::from_u32(self.sp_offset),
192
size: bytes,
193
})
194
}
195
196
fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
197
if bytes == 0 {
198
return Ok(());
199
}
200
201
self.asm
202
.sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
203
self.increment_sp(bytes);
204
205
Ok(())
206
}
207
208
fn free_stack(&mut self, bytes: u32) -> Result<()> {
209
if bytes == 0 {
210
return Ok(());
211
}
212
self.asm
213
.add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
214
self.decrement_sp(bytes);
215
216
Ok(())
217
}
218
219
fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
220
self.sp_offset = offset.as_u32();
221
222
Ok(())
223
}
224
225
fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
226
let (reg, offset) = if local.addressed_from_sp() {
227
let offset = self
228
.sp_offset
229
.checked_sub(local.offset)
230
.ok_or_else(|| CodeGenError::invalid_local_offset())?;
231
(rsp(), offset)
232
} else {
233
(rbp(), local.offset)
234
};
235
236
Ok(Address::offset(reg, offset))
237
}
238
239
fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
240
Ok(Address::offset(
241
regs::rsp(),
242
self.sp_offset - offset.as_u32(),
243
))
244
}
245
246
fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
247
Ok(Address::offset(regs::rsp(), offset.as_u32()))
248
}
249
250
fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
251
Ok(Address::offset(vmctx!(Self), offset))
252
}
253
254
fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
255
self.store(src.into(), dst, self.ptr_size)
256
}
257
258
fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
259
self.store_impl(src, dst, size, TRUSTED_FLAGS)
260
}
261
262
fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
263
match kind {
264
StoreKind::Operand(size) => {
265
self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
266
}
267
StoreKind::Atomic(size) => {
268
if size == OperandSize::S128 {
269
// TODO: we don't support 128-bit atomic store yet.
270
bail!(CodeGenError::unexpected_operand_size());
271
}
272
                // To stay consistent with cranelift, we emit a normal store followed by an mfence,
273
                // although we could probably just emit an xchg.
274
self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
275
self.asm.mfence();
276
}
277
StoreKind::VectorLane(LaneSelector { lane, size }) => {
278
self.ensure_has_avx()?;
279
self.asm
280
.xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);
281
}
282
}
283
284
Ok(())
285
}
286
287
fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
288
let current_sp = SPOffset::from_u32(self.sp_offset);
289
let _ = match (dst.to_reg().class(), size) {
290
(RegClass::Int, OperandSize::S32) => {
291
let addr = self.address_from_sp(current_sp)?;
292
self.asm.movzx_mr(
293
&addr,
294
dst,
295
size.extend_to::<Zero>(OperandSize::S64),
296
TRUSTED_FLAGS,
297
);
298
self.free_stack(size.bytes())?;
299
}
300
(RegClass::Int, OperandSize::S64) => {
301
self.asm.pop_r(dst);
302
self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
303
}
304
(RegClass::Float, _) | (RegClass::Vector, _) => {
305
let addr = self.address_from_sp(current_sp)?;
306
self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
307
self.free_stack(size.bytes())?;
308
}
309
_ => bail!(CodeGenError::invalid_operand_combination()),
310
};
311
Ok(())
312
}
313
314
fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {
315
let r = self
316
.scratch_scope
317
.reg_for_class(T::reg_class(), &mut |_| Ok(()))
318
.expect("Scratch register to be available");
319
320
let ret = f(self, Scratch::new(r));
321
self.scratch_scope.free(r);
322
ret
323
}
324
325
fn call(
326
&mut self,
327
stack_args_size: u32,
328
mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
329
) -> Result<u32> {
330
let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
331
let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
332
let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
333
let aligned_args_size = align_to(stack_args_size, alignment);
334
let total_stack = delta + aligned_args_size;
335
self.reserve_stack(total_stack)?;
336
let (callee, cc) = load_callee(self)?;
337
match callee {
338
CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
339
CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
340
};
341
Ok(total_stack)
342
}
343
344
fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
345
self.load(src, dst, self.ptr_size)
346
}
347
348
fn compute_addr(
349
&mut self,
350
src: Self::Address,
351
dst: WritableReg,
352
size: OperandSize,
353
) -> Result<()> {
354
self.asm.lea(&src, dst, size);
355
Ok(())
356
}
357
358
fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
359
self.load_impl(src, dst, size, TRUSTED_FLAGS)
360
}
361
362
fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
363
let size = kind.derive_operand_size();
364
365
match kind {
366
LoadKind::ScalarExtend(ext) => match ext {
367
ExtendKind::Signed(ext) => {
368
self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
369
}
370
ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
371
},
372
LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
373
// The guarantees of the x86-64 memory model ensure that `SeqCst`
374
// loads are equivalent to normal loads.
375
if kind.is_atomic() && size == OperandSize::S128 {
376
bail!(CodeGenError::unexpected_operand_size());
377
}
378
379
self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
380
}
381
LoadKind::VectorExtend(ext) => {
382
self.ensure_has_avx()?;
383
self.asm
384
.xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
385
}
386
LoadKind::Splat(_) => {
387
self.ensure_has_avx()?;
388
389
if size == OperandSize::S64 {
390
self.asm
391
.xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
392
self.asm.xmm_vpshuf_rr(
393
dst.to_reg(),
394
dst,
395
Self::vpshuf_mask_for_64_bit_splats(),
396
OperandSize::S32,
397
);
398
} else {
399
self.asm
400
.xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
401
}
402
}
403
LoadKind::VectorLane(LaneSelector { lane, size }) => {
404
self.ensure_has_avx()?;
405
self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {
406
masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;
407
masm.asm
408
.xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);
409
anyhow::Ok(())
410
})?;
411
}
412
LoadKind::VectorZero(size) => {
413
self.ensure_has_avx()?;
414
self.with_scratch::<IntScratch, _>(|masm, scratch| {
415
masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;
416
masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);
417
anyhow::Ok(())
418
})?;
419
}
420
}
421
422
Ok(())
423
}
424
425
fn sp_offset(&self) -> Result<SPOffset> {
426
Ok(SPOffset::from_u32(self.sp_offset))
427
}
428
429
fn zero(&mut self, reg: WritableReg) -> Result<()> {
430
self.asm.xor_rr(
431
reg.to_reg(),
432
reg,
433
OperandSize::from_bytes(<Self::ABI>::word_bytes()),
434
);
435
Ok(())
436
}
437
438
fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
439
match (src, dst.to_reg()) {
440
(RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
441
(RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
442
(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
443
_ => bail!(CodeGenError::invalid_operand_combination()),
444
},
445
(RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),
446
}
447
}
448
449
fn cmov(
450
&mut self,
451
dst: WritableReg,
452
src: Reg,
453
cc: IntCmpKind,
454
size: OperandSize,
455
) -> Result<()> {
456
match (src.class(), dst.to_reg().class()) {
457
(RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
458
(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
459
_ => Err(anyhow!(CodeGenError::invalid_operand_combination())),
460
}
461
}
462
463
fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
464
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
465
match (rhs, dst) {
466
(RegImm::Imm(imm), _) => {
467
if let Some(v) = imm.to_i32() {
468
self.asm.add_ir(v, dst, size);
469
} else {
470
self.with_scratch::<IntScratch, _>(|masm, scratch| {
471
masm.load_constant(&imm, scratch.writable(), size)?;
472
masm.asm.add_rr(scratch.inner(), dst, size);
473
anyhow::Ok(())
474
})?;
475
}
476
}
477
478
(RegImm::Reg(src), dst) => {
479
self.asm.add_rr(src, dst, size);
480
}
481
}
482
483
Ok(())
484
}
485
486
fn checked_uadd(
487
&mut self,
488
dst: WritableReg,
489
lhs: Reg,
490
rhs: RegImm,
491
size: OperandSize,
492
trap: TrapCode,
493
) -> Result<()> {
494
self.add(dst, lhs, rhs, size)?;
495
self.asm.trapif(CC::B, trap);
496
Ok(())
497
}
498
499
fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
500
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
501
match (rhs, dst) {
502
(RegImm::Imm(imm), reg) => {
503
if let Some(v) = imm.to_i32() {
504
self.asm.sub_ir(v, reg, size);
505
} else {
506
self.with_scratch::<IntScratch, _>(|masm, scratch| {
507
masm.load_constant(&imm, scratch.writable(), size)?;
508
masm.asm.sub_rr(scratch.inner(), reg, size);
509
anyhow::Ok(())
510
})?;
511
}
512
}
513
514
(RegImm::Reg(src), dst) => {
515
self.asm.sub_rr(src, dst, size);
516
}
517
}
518
519
Ok(())
520
}
521
522
fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
523
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
524
match (rhs, dst) {
525
(RegImm::Imm(imm), _) => {
526
if let Some(v) = imm.to_i32() {
527
self.asm.mul_ir(v, dst, size);
528
} else {
529
self.with_scratch::<IntScratch, _>(|masm, scratch| {
530
masm.load_constant(&imm, scratch.writable(), size)?;
531
masm.asm.mul_rr(scratch.inner(), dst, size);
532
anyhow::Ok(())
533
})?;
534
}
535
}
536
537
(RegImm::Reg(src), dst) => {
538
self.asm.mul_rr(src, dst, size);
539
}
540
}
541
542
Ok(())
543
}
544
545
fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
546
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
547
self.asm.xmm_add_rr(rhs, dst, size);
548
Ok(())
549
}
550
551
fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
552
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
553
self.asm.xmm_sub_rr(rhs, dst, size);
554
Ok(())
555
}
556
557
fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
558
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
559
self.asm.xmm_mul_rr(rhs, dst, size);
560
Ok(())
561
}
562
563
fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
564
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
565
self.asm.xmm_div_rr(rhs, dst, size);
566
Ok(())
567
}
568
569
fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
570
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
571
self.asm.xmm_min_seq(rhs, dst, size);
572
Ok(())
573
}
574
575
fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
576
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
577
self.asm.xmm_max_seq(rhs, dst, size);
578
Ok(())
579
}
580
581
fn float_copysign(
582
&mut self,
583
dst: WritableReg,
584
lhs: Reg,
585
rhs: Reg,
586
size: OperandSize,
587
) -> Result<()> {
588
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
589
let sign_mask = match size {
590
OperandSize::S32 => I::I32(0x80000000),
591
OperandSize::S64 => I::I64(0x8000000000000000),
592
OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
593
bail!(CodeGenError::unexpected_operand_size())
594
}
595
};
596
597
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
598
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
599
masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;
600
masm.asm
601
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
602
603
// Clear everything except sign bit in src.
604
masm.asm
605
.xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);
606
607
// Clear sign bit in dst using scratch to store result. Then copy the
608
// result back to dst.
609
masm.asm
610
.xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);
611
masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);
612
613
// Copy sign bit from src to dst.
614
masm.asm.xmm_or_rr(rhs, dst, size);
615
Ok(())
616
})
617
})
618
}
619
620
fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
621
debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
622
let mask = match size {
623
OperandSize::S32 => I::I32(0x80000000),
624
OperandSize::S64 => I::I64(0x8000000000000000),
625
OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
626
bail!(CodeGenError::unexpected_operand_size())
627
}
628
};
629
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
630
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
631
masm.load_constant(&mask, scratch_gpr.writable(), size)?;
632
masm.asm
633
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
634
masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);
635
Ok(())
636
})
637
})
638
}
639
640
fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
641
debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
642
let mask = match size {
643
OperandSize::S32 => I::I32(0x7fffffff),
644
OperandSize::S64 => I::I64(0x7fffffffffffffff),
645
OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
646
bail!(CodeGenError::unexpected_operand_size())
647
}
648
};
649
650
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
651
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
652
masm.load_constant(&mask, scratch_gpr.writable(), size)?;
653
654
masm.asm
655
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
656
masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);
657
Ok(())
658
})
659
})
660
}
661
662
fn float_round<
663
F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
664
>(
665
&mut self,
666
mode: RoundingMode,
667
env: &mut FuncEnv<Self::Ptr>,
668
context: &mut CodeGenContext<Emission>,
669
size: OperandSize,
670
mut fallback: F,
671
) -> Result<()> {
672
if self.flags.has_sse41() {
673
let src = context.pop_to_reg(self, None)?;
674
self.asm
675
.xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
676
context.stack.push(src.into());
677
Ok(())
678
} else {
679
fallback(env, context, self)
680
}
681
}
682
683
fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
684
self.asm.sqrt(src, dst, size);
685
Ok(())
686
}
687
688
fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
689
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
690
match (rhs, dst) {
691
(RegImm::Imm(imm), _) => {
692
if let Some(v) = imm.to_i32() {
693
self.asm.and_ir(v, dst, size);
694
} else {
695
self.with_scratch::<IntScratch, _>(|masm, scratch| {
696
masm.load_constant(&imm, scratch.writable(), size)?;
697
masm.asm.and_rr(scratch.inner(), dst, size);
698
anyhow::Ok(())
699
})?;
700
}
701
}
702
703
(RegImm::Reg(src), dst) => {
704
self.asm.and_rr(src, dst, size);
705
}
706
}
707
708
Ok(())
709
}
710
711
fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
712
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
713
match (rhs, dst) {
714
(RegImm::Imm(imm), _) => {
715
if let Some(v) = imm.to_i32() {
716
self.asm.or_ir(v, dst, size);
717
} else {
718
self.with_scratch::<IntScratch, _>(|masm, scratch| {
719
masm.load_constant(&imm, scratch.writable(), size)?;
720
masm.asm.or_rr(scratch.inner(), dst, size);
721
anyhow::Ok(())
722
})?;
723
}
724
}
725
726
(RegImm::Reg(src), dst) => {
727
self.asm.or_rr(src, dst, size);
728
}
729
}
730
731
Ok(())
732
}
733
734
fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
735
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
736
match (rhs, dst) {
737
(RegImm::Imm(imm), _) => {
738
if let Some(v) = imm.to_i32() {
739
self.asm.xor_ir(v, dst, size);
740
} else {
741
self.with_scratch::<IntScratch, _>(|masm, scratch| {
742
masm.load_constant(&imm, scratch.writable(), size)?;
743
masm.asm.xor_rr(scratch.inner(), dst, size);
744
anyhow::Ok(())
745
})?;
746
}
747
}
748
749
(RegImm::Reg(src), _) => {
750
self.asm.xor_rr(src, dst, size);
751
}
752
}
753
754
Ok(())
755
}
756
757
fn shift_ir(
758
&mut self,
759
dst: WritableReg,
760
imm: I,
761
lhs: Reg,
762
kind: ShiftKind,
763
size: OperandSize,
764
) -> Result<()> {
765
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
766
self.asm
767
.shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);
768
Ok(())
769
}
770
771
fn shift(
772
&mut self,
773
context: &mut CodeGenContext<Emission>,
774
kind: ShiftKind,
775
size: OperandSize,
776
) -> Result<()> {
777
// Number of bits to shift must be in the CL register.
778
let src = context.pop_to_reg(self, Some(regs::rcx()))?;
779
let dst = context.pop_to_reg(self, None)?;
780
781
self.asm
782
.shift_rr(src.into(), writable!(dst.into()), kind, size);
783
784
context.free_reg(src);
785
context.stack.push(dst.into());
786
787
Ok(())
788
}
789
790
fn div(
791
&mut self,
792
context: &mut CodeGenContext<Emission>,
793
kind: DivKind,
794
size: OperandSize,
795
) -> Result<()> {
796
// Allocate rdx:rax.
797
let rdx = context.reg(regs::rdx(), self)?;
798
let rax = context.reg(regs::rax(), self)?;
799
800
// Allocate the divisor, which can be any gpr.
801
let divisor = context.pop_to_reg(self, None)?;
802
803
// Mark rax as allocatable.
804
context.free_reg(rax);
805
// Move the top value to rax.
806
let rax = context.pop_to_reg(self, Some(rax))?;
807
self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);
808
809
// Free the divisor and rdx.
810
context.free_reg(divisor);
811
context.free_reg(rdx);
812
813
// Push the quotient.
814
context.stack.push(rax.into());
815
Ok(())
816
}
817
818
fn rem(
819
&mut self,
820
context: &mut CodeGenContext<Emission>,
821
kind: RemKind,
822
size: OperandSize,
823
) -> Result<()> {
824
// Allocate rdx:rax.
825
let rdx = context.reg(regs::rdx(), self)?;
826
let rax = context.reg(regs::rax(), self)?;
827
828
// Allocate the divisor, which can be any gpr.
829
let divisor = context.pop_to_reg(self, None)?;
830
831
// Mark rax as allocatable.
832
context.free_reg(rax);
833
// Move the top value to rax.
834
let rax = context.pop_to_reg(self, Some(rax))?;
835
self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);
836
837
// Free the divisor and rax.
838
context.free_reg(divisor);
839
context.free_reg(rax);
840
841
// Push the remainder.
842
context.stack.push(Val::reg(rdx, divisor.ty));
843
844
Ok(())
845
}
846
847
fn frame_restore(&mut self) -> Result<()> {
848
debug_assert_eq!(self.sp_offset, 0);
849
self.asm.pop_r(writable!(rbp()));
850
self.asm.ret();
851
Ok(())
852
}
853
854
fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
855
if let Some(patch) = self.stack_max_use_add {
856
patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
857
}
858
859
Ok(self.asm.finalize(base))
860
}
861
862
fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
863
Ok(Address::offset(reg, offset))
864
}
865
866
fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
867
match src2 {
868
RegImm::Imm(imm) => {
869
if let Some(v) = imm.to_i32() {
870
self.asm.cmp_ir(src1, v, size);
871
} else {
872
self.with_scratch::<IntScratch, _>(|masm, scratch| {
873
masm.load_constant(&imm, scratch.writable(), size)?;
874
masm.asm.cmp_rr(src1, scratch.inner(), size);
875
anyhow::Ok(())
876
})?;
877
}
878
}
879
RegImm::Reg(src2) => {
880
self.asm.cmp_rr(src1, src2, size);
881
}
882
}
883
884
Ok(())
885
}
886
887
fn cmp_with_set(
888
&mut self,
889
dst: WritableReg,
890
src: RegImm,
891
kind: IntCmpKind,
892
size: OperandSize,
893
) -> Result<()> {
894
self.cmp(dst.to_reg(), src, size)?;
895
self.asm.setcc(kind, dst);
896
Ok(())
897
}
898
899
fn float_cmp_with_set(
900
&mut self,
901
dst: WritableReg,
902
src1: Reg,
903
src2: Reg,
904
kind: FloatCmpKind,
905
size: OperandSize,
906
) -> Result<()> {
907
        // Float comparisons need to be ordered (that is, comparing with a NaN
908
// should return 0) except for not equal which needs to be unordered.
909
// We use ucomis{s, d} because comis{s, d} has an undefined result if
910
// either operand is NaN. Since ucomis{s, d} is unordered, we need to
911
// compensate to make the comparison ordered. Ucomis{s, d} sets the
912
// ZF, PF, and CF flags if there is an unordered result.
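        // For example, `FloatCmpKind::Eq` emits SETE after the `ucomis` and
        // then ANDs the result with SETNP below, so a NaN operand (which
        // sets PF) forces the final result to 0.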
913
let (src1, src2, set_kind) = match kind {
914
FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
915
FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
916
FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
917
FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
918
// Reversing the operands and using the complementary comparison
919
// avoids needing to perform an additional SETNP and AND
920
// instruction.
921
// SETNB and SETNBE check if the carry flag is unset (i.e., not
922
// less than and not unordered) so we get the intended result
923
// without having to look at the parity flag.
924
FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
925
FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
926
};
927
self.asm.ucomis(src1, src2, size);
928
self.asm.setcc(set_kind, dst);
929
let _ = match kind {
930
FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
931
// Return false if either operand is NaN by ensuring PF is
932
// unset.
933
self.with_scratch::<IntScratch, _>(|masm, scratch| {
934
masm.asm.setnp(scratch.writable());
935
masm.asm.and_rr(scratch.inner(), dst, size);
936
});
937
}
938
FloatCmpKind::Ne => {
939
// Return true if either operand is NaN by checking if PF is
940
// set.
941
self.with_scratch::<IntScratch, _>(|masm, scratch| {
942
masm.asm.setp(scratch.writable());
943
masm.asm.or_rr(scratch.inner(), dst, size);
944
});
945
}
946
FloatCmpKind::Lt | FloatCmpKind::Le => (),
947
};
948
Ok(())
949
}
950
951
fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
952
if self.flags.has_lzcnt() {
953
self.asm.lzcnt(src, dst, size);
954
} else {
955
self.with_scratch::<IntScratch, _>(|masm, scratch| {
956
// Use the following approach:
957
// dst = size.num_bits() - bsr(src) - is_not_zero
958
                // = size.num_bits() + -bsr(src) - is_not_zero.
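                // E.g. for a 32-bit value whose highest set bit is at
                // position 12, bsr yields 12 and is_not_zero is 1, so
                // dst = 32 - 12 - 1 = 19 leading zeros.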
959
masm.asm.bsr(src, dst, size);
960
masm.asm.setcc(IntCmpKind::Ne, scratch.writable());
961
masm.asm.neg(dst.to_reg(), dst, size);
962
masm.asm.add_ir(size.num_bits() as i32, dst, size);
963
masm.asm.sub_rr(scratch.inner(), dst, size);
964
});
965
}
966
967
Ok(())
968
}
969
970
fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
971
if self.flags.has_bmi1() {
972
self.asm.tzcnt(src, dst, size);
973
} else {
974
self.with_scratch::<IntScratch, _>(|masm, scratch| {
975
// Use the following approach:
976
// dst = bsf(src) + (is_zero * size.num_bits())
977
// = bsf(src) + (is_zero << size.log2()).
978
// BSF outputs the correct value for every value except 0.
979
                // When the value is 0, BSF outputs 0, but the correct ctz result is
980
// the number of bits.
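                // E.g. for a 32-bit src == 8, bsf yields 3 and is_zero is 0,
                // giving dst = 3; for src == 0, bsf contributes 0 and
                // is_zero << 5 adds 32, giving dst = 32.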
981
masm.asm.bsf(src, dst, size);
982
masm.asm.setcc(IntCmpKind::Eq, scratch.writable());
983
masm.asm
984
.shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);
985
masm.asm.add_rr(scratch.inner(), dst, size);
986
});
987
}
988
989
Ok(())
990
}
991
992
fn get_label(&mut self) -> Result<MachLabel> {
993
let buffer = self.asm.buffer_mut();
994
Ok(buffer.get_label())
995
}
996
997
fn bind(&mut self, label: MachLabel) -> Result<()> {
998
let buffer = self.asm.buffer_mut();
999
buffer.bind_label(label, &mut Default::default());
1000
Ok(())
1001
}
1002
1003
fn branch(
1004
&mut self,
1005
kind: IntCmpKind,
1006
lhs: Reg,
1007
rhs: RegImm,
1008
taken: MachLabel,
1009
size: OperandSize,
1010
) -> Result<()> {
1011
use IntCmpKind::*;
1012
1013
match &(lhs, rhs) {
1014
(rlhs, RegImm::Reg(rrhs)) => {
1015
// If the comparison kind is zero or not zero and both operands
1016
// are the same register, emit a test instruction. Else we emit
1017
// a normal comparison.
1018
if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
1019
self.asm.test_rr(*rlhs, *rrhs, size);
1020
} else {
1021
self.cmp(lhs, rhs, size)?;
1022
}
1023
}
1024
_ => self.cmp(lhs, rhs, size)?,
1025
}
1026
self.asm.jmp_if(kind, taken);
1027
Ok(())
1028
}
1029
1030
fn jmp(&mut self, target: MachLabel) -> Result<()> {
1031
self.asm.jmp(target);
1032
Ok(())
1033
}
1034
1035
fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
1036
let src = context.pop_to_reg(self, None)?;
1037
if self.flags.has_popcnt() && self.flags.has_sse42() {
1038
self.asm.popcnt(src.into(), writable!(src.into()), size);
1039
context.stack.push(src.into());
1040
Ok(())
1041
} else {
1042
// The fallback functionality here is based on `MacroAssembler::popcnt64` in:
1043
// https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495
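            // This is the classic SWAR popcount: the first three steps
            // compute per-2-bit, per-4-bit and then per-byte counts, and the
            // final multiply by `h01` sums every byte count into the top
            // byte, which the last shift extracts.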
1044
1045
let tmp = writable!(context.any_gpr(self)?);
1046
let dst = writable!(src.into());
1047
let (masks, shift_amt) = match size {
1048
OperandSize::S64 => (
1049
[
1050
0x5555555555555555, // m1
1051
0x3333333333333333, // m2
1052
0x0f0f0f0f0f0f0f0f, // m4
1053
0x0101010101010101, // h01
1054
],
1055
56u8,
1056
),
1057
// 32-bit popcount is the same, except the masks are half as
1058
// wide and we shift by 24 at the end rather than 56
1059
OperandSize::S32 => (
1060
[0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
1061
24u8,
1062
),
1063
_ => bail!(CodeGenError::unexpected_operand_size()),
1064
};
1065
self.asm.mov_rr(src.into(), tmp, size);
1066
1067
// x -= (x >> 1) & m1;
1068
self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
1069
let lhs = dst.to_reg();
1070
self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
1071
self.asm.sub_rr(dst.to_reg(), tmp, size);
1072
1073
// x = (x & m2) + ((x >> 2) & m2);
1074
self.asm.mov_rr(tmp.to_reg(), dst, size);
1075
// Load `0x3333...` into the scratch reg once, allowing us to use
1076
// `and_rr` and avoid inadvertently loading it twice as with `and`
1077
1078
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1079
masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;
1080
masm.asm.and_rr(scratch.inner(), dst, size);
1081
masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
1082
masm.asm.and_rr(scratch.inner(), tmp, size);
1083
anyhow::Ok(())
1084
})?;
1085
self.asm.add_rr(dst.to_reg(), tmp, size);
1086
1087
// x = (x + (x >> 4)) & m4;
1088
self.asm.mov_rr(tmp.to_reg(), dst, size);
1089
self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);
1090
self.asm.add_rr(tmp.to_reg(), dst, size);
1091
let lhs = dst.to_reg();
1092
self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;
1093
1094
// (x * h01) >> shift_amt
1095
let lhs = dst.to_reg();
1096
self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
1097
self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);
1098
1099
context.stack.push(src.into());
1100
context.free_reg(tmp.to_reg());
1101
1102
Ok(())
1103
}
1104
}
1105
1106
fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1107
self.asm.mov_rr(src, dst, OperandSize::S32);
1108
Ok(())
1109
}
1110
1111
fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
1112
match kind {
1113
ExtendKind::Signed(ext) => {
1114
self.asm.movsx_rr(src, dst, ext);
1115
}
1116
ExtendKind::Unsigned(ext) => {
1117
self.asm.movzx_rr(src, dst, ext);
1118
}
1119
}
1120
1121
Ok(())
1122
}
1123
1124
fn signed_truncate(
1125
&mut self,
1126
dst: WritableReg,
1127
src: Reg,
1128
src_size: OperandSize,
1129
dst_size: OperandSize,
1130
kind: TruncKind,
1131
) -> Result<()> {
1132
self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1133
masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1134
masm.asm.cvt_float_to_sint_seq(
1135
src,
1136
dst,
1137
gpr_scratch.inner(),
1138
xmm_scratch.inner(),
1139
src_size,
1140
dst_size,
1141
kind.is_checked(),
1142
);
1143
Ok(())
1144
})
1145
})
1146
}
1147
1148
fn unsigned_truncate(
1149
&mut self,
1150
ctx: &mut CodeGenContext<Emission>,
1151
src_size: OperandSize,
1152
dst_size: OperandSize,
1153
kind: TruncKind,
1154
) -> Result<()> {
1155
let dst_ty = match dst_size {
1156
OperandSize::S32 => WasmValType::I32,
1157
OperandSize::S64 => WasmValType::I64,
1158
_ => bail!(CodeGenError::unexpected_operand_size()),
1159
};
1160
1161
ctx.convert_op_with_tmp_reg(
1162
self,
1163
dst_ty,
1164
RegClass::Float,
1165
|masm, dst, src, tmp_fpr, dst_size| {
1166
masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1167
masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1168
masm.asm.cvt_float_to_uint_seq(
1169
src,
1170
writable!(dst),
1171
gpr_scratch.inner(),
1172
xmm_scratch.inner(),
1173
tmp_fpr,
1174
src_size,
1175
dst_size,
1176
kind.is_checked(),
1177
);
1178
Ok(())
1179
})
1180
})
1181
},
1182
)
1183
}
1184
1185
fn signed_convert(
1186
&mut self,
1187
dst: WritableReg,
1188
src: Reg,
1189
src_size: OperandSize,
1190
dst_size: OperandSize,
1191
) -> Result<()> {
1192
self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
1193
Ok(())
1194
}
1195
1196
fn unsigned_convert(
1197
&mut self,
1198
dst: WritableReg,
1199
src: Reg,
1200
tmp_gpr: Reg,
1201
src_size: OperandSize,
1202
dst_size: OperandSize,
1203
) -> Result<()> {
1204
        // Need to zero-extend the unsigned 32-bit value to 64 bits for the conversion instruction sequence.
1205
if let OperandSize::S32 = src_size {
1206
self.extend(
1207
writable!(src),
1208
src,
1209
ExtendKind::Unsigned(Extend::I64Extend32),
1210
)?;
1211
}
1212
1213
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1214
masm.asm
1215
.cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);
1216
Ok(())
1217
})
1218
}
1219
1220
fn reinterpret_float_as_int(
1221
&mut self,
1222
dst: WritableReg,
1223
src: Reg,
1224
size: OperandSize,
1225
) -> Result<()> {
1226
self.asm.xmm_to_gpr(src, dst, size);
1227
Ok(())
1228
}
1229
1230
fn reinterpret_int_as_float(
1231
&mut self,
1232
dst: WritableReg,
1233
src: Reg,
1234
size: OperandSize,
1235
) -> Result<()> {
1236
self.asm.gpr_to_xmm(src, dst, size);
1237
Ok(())
1238
}
1239
1240
fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1241
self.asm
1242
.cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);
1243
Ok(())
1244
}
1245
1246
fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1247
self.asm
1248
.cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);
1249
Ok(())
1250
}
1251
1252
fn unreachable(&mut self) -> Result<()> {
1253
self.asm.trap(TRAP_UNREACHABLE);
1254
Ok(())
1255
}
1256
1257
fn trap(&mut self, code: TrapCode) -> Result<()> {
1258
self.asm.trap(code);
1259
Ok(())
1260
}
1261
1262
fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
1263
self.asm.trapif(cc, code);
1264
Ok(())
1265
}
1266
1267
fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
1268
self.asm.test_rr(src, src, self.ptr_size);
1269
self.asm.trapif(IntCmpKind::Eq, code);
1270
Ok(())
1271
}
1272
1273
fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
1274
// At least one default target.
1275
debug_assert!(targets.len() >= 1);
1276
let default_index = targets.len() - 1;
1277
        // Emit a bounds check by conditionally moving the max case
1278
// into the given index reg if the contents of the index reg
1279
// are greater.
1280
let max = default_index;
1281
let size = OperandSize::S32;
1282
self.asm.mov_ir(max as u64, writable!(tmp), size);
1283
self.asm.cmp_rr(tmp, index, size);
1284
self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);
1285
1286
let default = targets[default_index];
1287
let rest = &targets[0..default_index];
1288
1289
self.with_scratch::<IntScratch, _>(|masm, tmp1| {
1290
masm.asm
1291
.jmp_table(rest.into(), default, index, tmp1.inner(), tmp);
1292
Ok(())
1293
})
1294
}
1295
1296
fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
1297
Ok(self.asm.buffer_mut().start_srcloc(loc))
1298
}
1299
1300
fn end_source_loc(&mut self) -> Result<()> {
1301
self.asm.buffer_mut().end_srcloc();
1302
Ok(())
1303
}
1304
1305
fn current_code_offset(&self) -> Result<CodeOffset> {
1306
Ok(self.asm.buffer().cur_offset())
1307
}
1308
1309
fn add128(
1310
&mut self,
1311
dst_lo: WritableReg,
1312
dst_hi: WritableReg,
1313
lhs_lo: Reg,
1314
lhs_hi: Reg,
1315
rhs_lo: Reg,
1316
rhs_hi: Reg,
1317
) -> Result<()> {
1318
Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1319
Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1320
self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
1321
self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
1322
Ok(())
1323
}
1324
1325
fn sub128(
1326
&mut self,
1327
dst_lo: WritableReg,
1328
dst_hi: WritableReg,
1329
lhs_lo: Reg,
1330
lhs_hi: Reg,
1331
rhs_lo: Reg,
1332
rhs_hi: Reg,
1333
) -> Result<()> {
1334
Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1335
Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1336
self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
1337
self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
1338
Ok(())
1339
}
1340
1341
fn mul_wide(
1342
&mut self,
1343
context: &mut CodeGenContext<Emission>,
1344
kind: MulWideKind,
1345
) -> Result<()> {
1346
// Reserve rax/rdx since they're required by the `mul_wide` instruction
1347
// being used here.
1348
let rax = context.reg(regs::rax(), self)?;
1349
let rdx = context.reg(regs::rdx(), self)?;
1350
1351
// The rhs of this binop can be in any register
1352
let rhs = context.pop_to_reg(self, None)?;
1353
        // Mark rax as allocatable, and then force the lhs operand to be placed
1354
// in `rax`.
1355
context.free_reg(rax);
1356
let lhs = context.pop_to_reg(self, Some(rax))?;
1357
1358
self.asm.mul_wide(
1359
writable!(rax),
1360
writable!(rdx),
1361
lhs.reg,
1362
rhs.reg,
1363
kind,
1364
OperandSize::S64,
1365
);
1366
1367
// No longer using the rhs register after the multiplication has been
1368
// executed.
1369
context.free_reg(rhs);
1370
1371
        // The low bits of the result are in rax, where `lhs` was allocated.
1372
context.stack.push(lhs.into());
1373
// The high bits of the result are in rdx, which we previously reserved.
1374
context.stack.push(Val::Reg(TypedReg::i64(rdx)));
1375
1376
Ok(())
1377
}
1378
1379
fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
1380
// Get the source and destination operands set up first.
1381
let (src, dst) = match size {
1382
// Floats can use the same register for `src` and `dst`.
1383
SplatKind::F32x4 | SplatKind::F64x2 => {
1384
let reg = context.pop_to_reg(self, None)?.reg;
1385
(RegImm::reg(reg), writable!(reg))
1386
}
1387
// For ints, we need to load the operand into a vector register if
1388
// it's not a constant.
1389
SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
1390
let dst = writable!(context.any_fpr(self)?);
1391
let src = if size == SplatKind::I64x2 {
1392
context.pop_i64_const().map(RegImm::i64)
1393
} else {
1394
context.pop_i32_const().map(RegImm::i32)
1395
}
1396
.map_or_else(
1397
|| -> Result<RegImm> {
1398
let reg = context.pop_to_reg(self, None)?.reg;
1399
self.reinterpret_int_as_float(
1400
dst,
1401
reg,
1402
match size {
1403
SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
1404
OperandSize::S32
1405
}
1406
SplatKind::I64x2 => OperandSize::S64,
1407
SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
1408
},
1409
)?;
1410
context.free_reg(reg);
1411
Ok(RegImm::Reg(dst.to_reg()))
1412
},
1413
Ok,
1414
)?;
1415
(src, dst)
1416
}
1417
};
1418
1419
// Perform the splat on the operands.
1420
if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
1421
self.ensure_has_avx()?;
1422
let mask = Self::vpshuf_mask_for_64_bit_splats();
1423
match src {
1424
RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
1425
RegImm::Imm(imm) => {
1426
let src = self.asm.add_constant(&imm.to_bytes());
1427
self.asm
1428
.xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
1429
}
1430
}
1431
} else {
1432
self.ensure_has_avx2()?;
1433
1434
match src {
1435
RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
1436
RegImm::Imm(imm) => {
1437
let src = self.asm.add_constant(&imm.to_bytes());
1438
self.asm
1439
.xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
1440
}
1441
}
1442
}
1443
1444
context
1445
.stack
1446
.push(Val::reg(dst.to_reg(), WasmValType::V128));
1447
Ok(())
1448
}
1449
1450
fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1451
self.ensure_has_avx()?;
1452
1453
// Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
1454
// separately to either the selected index or 0.
1455
// Then use `vpor` to combine `lhs` and `rhs` into `dst`.
1456
// Setting the most significant bit in the mask's lane to 1 will
1457
        // result in the corresponding lane in the destination register being
1458
// set to 0. 0x80 sets the most significant bit to 1.
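        // For example, if `lanes[i]` is 20, the byte comes from lane 4 of
        // `rhs`: `mask_lhs[i]` stays 0x80 (select zero) and `mask_rhs[i]`
        // becomes 4.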
1459
let mut mask_lhs: [u8; 16] = [0x80; 16];
1460
let mut mask_rhs: [u8; 16] = [0x80; 16];
1461
for i in 0..lanes.len() {
1462
if lanes[i] < 16 {
1463
mask_lhs[i] = lanes[i];
1464
} else {
1465
mask_rhs[i] = lanes[i] - 16;
1466
}
1467
}
1468
let mask_lhs = self.asm.add_constant(&mask_lhs);
1469
let mask_rhs = self.asm.add_constant(&mask_rhs);
1470
1471
self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1472
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1473
masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);
1474
masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());
1475
Ok(())
1476
})
1477
}
1478
1479
fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
1480
self.ensure_has_avx()?;
1481
1482
// Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
1483
// outside that range.
1484
// Each lane is a signed byte so the maximum value is 0x7F. Adding
1485
// 0x70 to any value higher than 0xF will saturate resulting in a value
1486
// of 0xFF (i.e., 0).
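        // Note that any index above 0xF ends up with its most significant
        // bit set after the saturating add (the sum either lands in
        // 0x80..=0xFF or saturates to 0xFF), and `vpshufb` writes 0 for
        // lanes whose mask byte has the MSB set.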
1487
let clamp = self.asm.add_constant(&[0x70; 16]);
1488
self.asm
1489
.xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);
1490
1491
// Don't need to subtract 0x70 since `vpshufb` uses the least
1492
// significant 4 bits which are the same after adding 0x70.
1493
self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
1494
Ok(())
1495
}
1496
1497
fn atomic_rmw(
1498
&mut self,
1499
context: &mut CodeGenContext<Emission>,
1500
addr: Self::Address,
1501
size: OperandSize,
1502
op: RmwOp,
1503
flags: MemFlags,
1504
extend: Option<Extend<Zero>>,
1505
) -> Result<()> {
1506
let res = match op {
1507
RmwOp::Add => {
1508
let operand = context.pop_to_reg(self, None)?;
1509
self.asm
1510
.lock_xadd(addr, writable!(operand.reg), size, flags);
1511
operand.reg
1512
}
1513
RmwOp::Sub => {
1514
let operand = context.pop_to_reg(self, None)?;
1515
self.asm.neg(operand.reg, writable!(operand.reg), size);
1516
self.asm
1517
.lock_xadd(addr, writable!(operand.reg), size, flags);
1518
operand.reg
1519
}
1520
RmwOp::Xchg => {
1521
let operand = context.pop_to_reg(self, None)?;
1522
self.asm.xchg(addr, writable!(operand.reg), size, flags);
1523
operand.reg
1524
}
1525
RmwOp::And | RmwOp::Or | RmwOp::Xor => {
1526
let op = match op {
1527
RmwOp::And => AtomicRmwSeqOp::And,
1528
RmwOp::Or => AtomicRmwSeqOp::Or,
1529
RmwOp::Xor => AtomicRmwSeqOp::Xor,
1530
_ => unreachable!(
1531
"invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
1532
),
1533
};
1534
let dst = context.reg(regs::rax(), self)?;
1535
let operand = context.pop_to_reg(self, None)?;
1536
1537
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1538
masm.asm.atomic_rmw_seq(
1539
addr,
1540
operand.reg,
1541
writable!(dst),
1542
scratch.writable(),
1543
size,
1544
flags,
1545
op,
1546
);
1547
});
1548
1549
context.free_reg(operand.reg);
1550
dst
1551
}
1552
};
1553
1554
let dst_ty = match extend {
1555
Some(ext) => {
1556
                // We don't need to zero-extend from 32 to 64 bits.
1557
if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
1558
self.asm.movzx_rr(res, writable!(res), ext);
1559
}
1560
1561
WasmValType::int_from_bits(ext.to_bits())
1562
}
1563
None => WasmValType::int_from_bits(size.num_bits()),
1564
};
1565
1566
context.stack.push(TypedReg::new(dst_ty, res).into());
1567
1568
Ok(())
1569
}
1570
1571
fn extract_lane(
1572
&mut self,
1573
src: Reg,
1574
dst: WritableReg,
1575
lane: u8,
1576
kind: ExtractLaneKind,
1577
) -> Result<()> {
1578
self.ensure_has_avx()?;
1579
1580
match kind {
1581
ExtractLaneKind::I8x16S
1582
| ExtractLaneKind::I8x16U
1583
| ExtractLaneKind::I16x8S
1584
| ExtractLaneKind::I16x8U
1585
| ExtractLaneKind::I32x4
1586
| ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
1587
ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
1588
// If the `src` and `dst` registers are the same, then the
1589
// appropriate value is already in the correct position in
1590
// the register.
1591
assert!(src == dst.to_reg());
1592
}
1593
ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
1594
ExtractLaneKind::F64x2 => {
1595
// `0b11_10` selects the high and low 32-bits of the second
1596
                // 64-bit lane, so `0b11_10_11_10` splats the 64-bit value across
1597
// both lanes. Since we put an `f64` on the stack, we use
1598
// the splatted value.
1599
// Double-check `lane == 0` was handled in another branch.
1600
assert!(lane == 1);
1601
self.asm
1602
.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
1603
}
1604
}
1605
1606
// Sign-extend to 32-bits for sign extended kinds.
1607
match kind {
1608
ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
1609
self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
1610
}
1611
_ => (),
1612
}
1613
1614
Ok(())
1615
}
1616
1617
fn replace_lane(
1618
&mut self,
1619
src: RegImm,
1620
dst: WritableReg,
1621
lane: u8,
1622
kind: ReplaceLaneKind,
1623
) -> Result<()> {
1624
self.ensure_has_avx()?;
1625
1626
match kind {
1627
ReplaceLaneKind::I8x16
1628
| ReplaceLaneKind::I16x8
1629
| ReplaceLaneKind::I32x4
1630
| ReplaceLaneKind::I64x2 => match src {
1631
RegImm::Reg(reg) => {
1632
self.asm
1633
.xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
1634
}
1635
RegImm::Imm(imm) => {
1636
let address = self.asm.add_constant(&imm.to_bytes());
1637
self.asm
1638
.xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
1639
}
1640
},
1641
ReplaceLaneKind::F32x4 => {
1642
                // The immediate for `vinsertps` uses its low 4 bits to determine
1643
// which elements of the destination to set to 0. The next 2
1644
// bits specify which element of the destination will be
1645
// overwritten.
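                // For example, `lane == 2` yields `imm == 0b10_0000`, which
                // overwrites element 2 of the destination and zeroes nothing.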
1646
let imm = lane << 4;
1647
match src {
1648
RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
1649
RegImm::Imm(val) => {
1650
let address = self.asm.add_constant(&val.to_bytes());
1651
self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
1652
}
1653
}
1654
}
1655
ReplaceLaneKind::F64x2 => match src {
1656
RegImm::Reg(reg) => match lane {
1657
0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
1658
1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
1659
_ => unreachable!(),
1660
},
1661
RegImm::Imm(imm) => {
1662
let address = self.asm.add_constant(&imm.to_bytes());
1663
match lane {
1664
0 => {
1665
// Memory load variant of `vmovsd` zeroes the upper
1666
                            // 64 bits of the register, so we need to load the
1667
// immediate to a register to use the register
1668
// variant of `vmovsd` to perform the merge.
1669
1670
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1671
masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);
1672
masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());
1673
});
1674
}
1675
1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
1676
_ => unreachable!(),
1677
}
1678
}
1679
},
1680
}
1681
Ok(())
1682
}
1683
1684
fn atomic_cas(
1685
&mut self,
1686
context: &mut CodeGenContext<Emission>,
1687
addr: Self::Address,
1688
size: OperandSize,
1689
flags: MemFlags,
1690
extend: Option<Extend<Zero>>,
1691
) -> Result<()> {
1692
// `cmpxchg` expects `expected` to be in the `*a*` register.
1693
// reserve rax for the expected argument.
1694
let rax = context.reg(regs::rax(), self)?;
1695
1696
let replacement = context.pop_to_reg(self, None)?;
1697
1698
// mark `rax` as allocatable again.
1699
context.free_reg(rax);
1700
let expected = context.pop_to_reg(self, Some(regs::rax()))?;
1701
1702
self.asm
1703
.cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags);
1704
1705
if let Some(extend) = extend {
1706
            // We don't need to zero-extend from 32 to 64 bits.
1707
if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
1708
self.asm
1709
.movzx_rr(expected.reg, writable!(expected.reg), extend);
1710
}
1711
}
1712
1713
context.stack.push(expected.into());
1714
context.free_reg(replacement);
1715
1716
Ok(())
1717
}
1718
1719
fn v128_eq(
1720
&mut self,
1721
dst: WritableReg,
1722
lhs: Reg,
1723
rhs: Reg,
1724
kind: VectorEqualityKind,
1725
) -> Result<()> {
1726
self.ensure_has_avx()?;
1727
1728
match kind {
1729
VectorEqualityKind::I8x16
1730
| VectorEqualityKind::I16x8
1731
| VectorEqualityKind::I32x4
1732
| VectorEqualityKind::I64x2 => {
1733
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
1734
}
1735
VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1736
self.asm
1737
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
1738
}
1739
}
1740
Ok(())
1741
}
1742
1743
fn v128_ne(
1744
&mut self,
1745
dst: WritableReg,
1746
lhs: Reg,
1747
rhs: Reg,
1748
kind: VectorEqualityKind,
1749
) -> Result<()> {
1750
self.ensure_has_avx()?;
1751
1752
match kind {
1753
VectorEqualityKind::I8x16
1754
| VectorEqualityKind::I16x8
1755
| VectorEqualityKind::I32x4
1756
| VectorEqualityKind::I64x2 => {
1757
// Check for equality and invert the results.
1758
self.asm
1759
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1760
self.asm
1761
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1762
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1763
}
1764
VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1765
self.asm
1766
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
1767
}
1768
}
1769
Ok(())
1770
}
1771
1772
fn v128_lt(
1773
&mut self,
1774
dst: WritableReg,
1775
lhs: Reg,
1776
rhs: Reg,
1777
kind: VectorCompareKind,
1778
) -> Result<()> {
1779
self.ensure_has_avx()?;
1780
1781
match kind {
1782
VectorCompareKind::I8x16S
1783
| VectorCompareKind::I16x8S
1784
| VectorCompareKind::I32x4S
1785
| VectorCompareKind::I64x2S => {
1786
// Perform a greater than check with reversed parameters.
1787
self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
1788
}
1789
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1790
// Set `lhs` to min values, check for equality, then invert the
1791
// result.
1792
                // If `lhs` is smaller, the equality check will fail and the result
1793
// will be inverted to true. Otherwise the equality check will
1794
// pass and be inverted to false.
1795
self.asm
1796
.xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1797
self.asm
1798
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1799
self.asm
1800
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1801
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1802
}
1803
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1804
self.asm
1805
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
1806
}
1807
}
1808
Ok(())
1809
}
1810
1811
fn v128_le(
1812
&mut self,
1813
dst: WritableReg,
1814
lhs: Reg,
1815
rhs: Reg,
1816
kind: VectorCompareKind,
1817
) -> Result<()> {
1818
self.ensure_has_avx()?;
1819
1820
match kind {
1821
VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1822
// Set the `rhs` vector to the signed minimum values and then
1823
// compare them with `lhs` for equality.
1824
self.asm
1825
.xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1826
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1827
}
1828
VectorCompareKind::I64x2S => {
1829
// Do a greater than check and invert the results.
1830
self.asm
1831
.xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1832
self.asm
1833
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1834
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1835
}
1836
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1837
                // Set the `rhs` vector to the unsigned minimum values and then
1838
// compare them with `lhs` for equality.
1839
self.asm
1840
.xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1841
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1842
}
1843
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1844
self.asm
1845
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
1846
}
1847
}
1848
Ok(())
1849
}
1850
1851
fn v128_gt(
1852
&mut self,
1853
dst: WritableReg,
1854
lhs: Reg,
1855
rhs: Reg,
1856
kind: VectorCompareKind,
1857
) -> Result<()> {
1858
self.ensure_has_avx()?;
1859
1860
match kind {
1861
VectorCompareKind::I8x16S
1862
| VectorCompareKind::I16x8S
1863
| VectorCompareKind::I32x4S
1864
| VectorCompareKind::I64x2S => {
1865
self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
1866
}
1867
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1868
// Set `lhs` to the maximum values, check for equality, then invert
// the result.
// If `lhs` is larger, the equality check will fail and the result
// will be inverted to true. Otherwise the equality check will pass
// and be inverted to false.
1873
self.asm
1874
.xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1875
self.asm
1876
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1877
self.asm
1878
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1879
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1880
}
1881
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1882
// Do a less than comparison with the operands swapped.
1883
self.asm
1884
.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
1885
}
1886
}
1887
Ok(())
1888
}
1889
1890
fn v128_ge(
1891
&mut self,
1892
dst: WritableReg,
1893
lhs: Reg,
1894
rhs: Reg,
1895
kind: VectorCompareKind,
1896
) -> Result<()> {
1897
self.ensure_has_avx()?;
1898
1899
match kind {
1900
VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1901
// Set each lane to maximum value and then compare for equality.
1902
self.asm
1903
.xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1904
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1905
}
1906
VectorCompareKind::I64x2S => {
1907
// Perform a greater than comparison with operands swapped,
1908
// then invert the results.
1909
self.asm
1910
.xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
1911
self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
1912
self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
1913
}
1914
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1915
// Set lanes to maximum values and compare them for equality.
1916
self.asm
1917
.xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1918
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1919
}
1920
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1921
// Perform a less than or equal comparison on swapped operands.
1922
self.asm
1923
.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
1924
}
1925
}
1926
1927
Ok(())
1928
}
1929
1930
fn fence(&mut self) -> Result<()> {
1931
self.asm.mfence();
1932
Ok(())
1933
}
1934
1935
fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
1936
self.ensure_has_avx()?;
1937
1938
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
1939
// First, we initialize `tmp` with all ones by comparing it with
1940
// itself.
1941
masm.asm
1942
.xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);
1943
// Then we `xor` tmp and `dst` together, yielding `!dst`.
1944
masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);
1945
Ok(())
1946
})
1947
}
1948
1949
fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1950
self.ensure_has_avx()?;
1951
self.asm.xmm_vpand_rrr(src1, src2, dst);
1952
Ok(())
1953
}
1954
1955
fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1956
self.ensure_has_avx()?;
1957
self.asm.xmm_vpandn_rrr(src1, src2, dst);
1958
Ok(())
1959
}
1960
1961
fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1962
self.ensure_has_avx()?;
1963
self.asm.xmm_vpor_rrr(dst, src1, src2);
1964
Ok(())
1965
}
1966
1967
fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1968
self.ensure_has_avx()?;
1969
self.asm.xmm_vpxor_rrr(src1, src2, dst);
1970
Ok(())
1971
}
1972
1973
fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
1974
self.ensure_has_avx()?;
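// Added note (a sketch of the intent, assuming the usual Wasm
// semantics of `v128.bitselect`): the result is
// `(src1 & mask) | (src2 & !mask)`. The `v128_and_not` call below
// supplies the `src2 & !mask` term, since x86 `vpandn` negates its
// first operand. A scalar sketch, for illustration only:
//
// fn bitselect(v1: u64, v2: u64, c: u64) -> u64 {
//     (v1 & c) | (v2 & !c)
// }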
1975
1976
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
1977
masm.v128_and(src1, mask, tmp.writable())?;
1978
masm.v128_and_not(mask, src2, dst)?;
1979
masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;
1980
Ok(())
1981
})
1982
}
1983
1984
fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
1985
self.ensure_has_avx()?;
1986
self.asm.xmm_vptest(src, src);
1987
self.asm.setcc(IntCmpKind::Ne, dst);
1988
Ok(())
1989
}
1990
1991
fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
1992
self.ensure_has_avx()?;
1993
match kind {
1994
V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
1995
V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
1996
V128ConvertKind::I32x4U => {
1997
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1998
// Split each 32-bit integer into 16-bit parts.
1999
// `scratch` will contain the low bits and `dst` will contain
2000
// the high bits.
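// Added worked example (illustrative only): `vcvtdq2ps` treats its
// input as signed, so a lane such as 3_000_000_000 (0xB2D05E00)
// cannot be converted directly. Splitting gives low = 0x5E00
// (24_064) and high = 0xB2D00000 (2_999_975_936); the high half is
// halved to 1_499_987_968 so it fits in the signed range, converted,
// doubled, and then added to the converted low half to recover
// roughly 3.0e9.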
2001
masm.asm
2002
.xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());
2003
masm.asm.xmm_vpsrl_rri(
2004
scratch.inner(),
2005
scratch.writable(),
2006
0x10,
2007
kind.src_lane_size(),
2008
);
2009
masm.asm
2010
.xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2011
2012
// Convert the low bits in `scratch` to floating point numbers.
2013
masm.asm
2014
.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
2015
2016
// Prevent overflow by right shifting high bits.
2017
masm.asm
2018
.xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());
2019
// Convert high bits in `dst` to floating point numbers.
2020
masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
2021
// Double high bits in `dst` to reverse right shift.
2022
masm.asm
2023
.xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
2024
// Add high bits in `dst` to low bits in `scratch`.
2025
masm.asm.xmm_vaddp_rrr(
2026
dst.to_reg(),
2027
scratch.inner(),
2028
dst,
2029
kind.src_lane_size(),
2030
);
2031
});
2032
}
2033
V128ConvertKind::I32x4LowU => {
2034
// See
2035
// https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
2036
// for details on the Cranelift AVX implementation.
2037
// Use `vunpcklp` to create doubles from the integers.
2038
// Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers
2039
// creates a byte array for a double that sets the mantissa
2040
// bits to the original integer value.
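// Added worked equation (illustrative only): interleaving a u32 such
// as 42 with 0x43300000 produces the double bit pattern
// 0x4330_0000_0000_002A, which is exactly 2^52 + 42; subtracting
// 2^52 (done below) leaves 42.0.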
2041
let conversion_constant = self
2042
.asm
2043
.add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
2044
self.asm
2045
.xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
2046
// Subtract the 0x1.0p52 added above.
2047
let conversion_constant = self.asm.add_constant(&[
2048
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
2049
0x00, 0x30, 0x43,
2050
]);
2051
self.asm.xmm_vsub_rrm(
2052
dst.to_reg(),
2053
&conversion_constant,
2054
dst,
2055
kind.dst_lane_size(),
2056
);
2057
}
2058
}
2059
Ok(())
2060
}
2061
2062
fn v128_narrow(
2063
&mut self,
2064
src1: Reg,
2065
src2: Reg,
2066
dst: WritableReg,
2067
kind: V128NarrowKind,
2068
) -> Result<()> {
2069
self.ensure_has_avx()?;
2070
match kind {
2071
V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
2072
self.asm
2073
.xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
2074
}
2075
V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
2076
self.asm
2077
.xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
2078
}
2079
}
2080
Ok(())
2081
}
2082
2083
fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2084
self.ensure_has_avx()?;
2085
self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
2086
Ok(())
2087
}
2088
2089
fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2090
self.ensure_has_avx()?;
2091
self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
2092
Ok(())
2093
}
2094
2095
fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
2096
self.ensure_has_avx()?;
2097
match kind {
2098
V128ExtendKind::LowI8x16S
2099
| V128ExtendKind::LowI8x16U
2100
| V128ExtendKind::LowI16x8S
2101
| V128ExtendKind::LowI16x8U
2102
| V128ExtendKind::LowI32x4S
2103
| V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
2104
V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
2105
self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
2106
self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2107
}
2108
V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
2109
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2110
masm.asm
2111
.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2112
masm.asm
2113
.xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2114
});
2115
}
2116
V128ExtendKind::HighI32x4S => {
2117
// Move the 3rd element (i.e., 0b10) to the 1st (rightmost)
2118
// position and the 4th element (i.e., 0b11) to the 2nd (second
2119
// from the right) position and then perform the extend.
2120
self.asm
2121
.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
2122
self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2123
}
2124
V128ExtendKind::HighI32x4U => {
2125
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2126
// Set `scratch` to a vector of 0s.
2127
masm.asm.xmm_vxorp_rrr(
2128
scratch.inner(),
2129
scratch.inner(),
2130
scratch.writable(),
2131
kind.src_lane_size(),
2132
);
2133
// Interleave the 0 bits into the two 32-bit integers to zero extend them.
2134
masm.asm
2135
.xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2136
});
2137
}
2138
}
2139
Ok(())
2140
}
2141
2142
fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
2143
self.ensure_has_avx()?;
2144
match kind {
2145
V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),
2146
V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),
2147
V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),
2148
V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),
2149
V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),
2150
V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),
2151
V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),
2152
V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),
2153
V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),
2154
V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),
2155
};
2156
Ok(())
2157
}
2158
2159
fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
2160
self.ensure_has_avx()?;
2161
match kind {
2162
V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),
2163
V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),
2164
V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),
2165
V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),
2166
V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),
2167
V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),
2168
V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),
2169
V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),
2170
V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),
2171
V128SubKind::I64x2 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
2172
};
2173
Ok(())
2174
}
2175
2176
fn v128_mul(
2177
&mut self,
2178
context: &mut CodeGenContext<Emission>,
2179
kind: V128MulKind,
2180
) -> Result<()> {
2181
self.ensure_has_avx()?;
2182
2183
let rhs = context.pop_to_reg(self, None)?;
2184
let lhs = context.pop_to_reg(self, None)?;
2185
2186
let mul_i64x2_avx512 = |this: &mut Self| {
2187
this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
2188
};
2189
2190
let mul_i64x2_fallback = |this: &mut Self,
2191
context: &mut CodeGenContext<Emission>|
2192
-> Result<()> {
2193
// Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we have to fall back
// to an instruction sequence using 32-bit multiplications (taken from the Cranelift
// implementation, in `isa/x64/lower.isle`):
2196
//
2197
// > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
2198
// > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
2199
// > multiplication can then be written as:
2200
//
2201
// > Ah Al
2202
// > * Bh Bl
2203
// > -----
2204
// > Al * Bl
2205
// > + (Ah * Bl) << 32
2206
// > + (Al * Bh) << 32
2207
//
2208
// > So for each lane we will compute:
2209
//
2210
// > A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
2211
//
2212
// > Note, the algorithm will use `pmuludq` which operates directly on the lower
2213
// > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
2214
// > the lane of the destination. For this reason we don't need shifts to isolate
2215
// > the lower 32-bits, however, we will need to use shifts to isolate the high
2216
// > 32-bits when doing calculations, i.e., `Ah == A >> 32`.
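// Added worked example (illustrative only): for A = 2*2^32 + 3 and
// B = 4*2^32 + 5, the formula gives 3*5 + ((2*5 + 3*4) << 32)
// = 15 + 22*2^32, which matches A*B modulo 2^64. A scalar sketch of
// the same decomposition:
//
// fn mul_lo64(a: u64, b: u64) -> u64 {
//     let (ah, al) = (a >> 32, a & 0xffff_ffff);
//     let (bh, bl) = (b >> 32, b & 0xffff_ffff);
//     al.wrapping_mul(bl)
//         .wrapping_add(ah.wrapping_mul(bl).wrapping_add(al.wrapping_mul(bh)) << 32)
// }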
2217
2218
let tmp2 = context.any_fpr(this)?;
2219
this.with_scratch::<FloatScratch, _>(|this, tmp1| {
2220
// tmp1 = lhs_hi = (lhs >> 32)
2221
this.asm
2222
.xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);
2223
2224
// tmp2 = lhs_hi * rhs_low = tmp1 * rhs
2225
this.asm
2226
.xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));
2227
2228
// tmp1 = rhs_hi = rhs >> 32
2229
this.asm
2230
.xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);
2231
2232
// tmp1 = lhs_low * rhs_high = tmp1 * lhs
2233
this.asm
2234
.xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());
2235
2236
// tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2237
this.asm
2238
.xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);
2239
2240
// tmp1 = tmp1 << 32
2241
this.asm
2242
.xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);
2243
2244
// tmp2 = lhs_lo * rhs_lo
2245
this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));
2246
2247
// finally, with `lhs` as destination:
2248
// lhs = (lhs_low * rhs_low) + ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2249
this.asm
2250
.xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
2251
});
2252
2253
context.free_reg(tmp2);
2254
2255
Ok(())
2256
};
2257
2258
match kind {
2259
V128MulKind::F32x4 => {
2260
self.asm
2261
.xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2262
}
2263
V128MulKind::F64x2 => {
2264
self.asm
2265
.xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
2266
}
2267
V128MulKind::I16x8 => {
2268
self.asm
2269
.xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
2270
}
2271
V128MulKind::I32x4 => {
2272
self.asm
2273
.xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2274
}
2275
// This is the fast path when AVX512 is available.
2276
V128MulKind::I64x2
2277
if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
2278
{
2279
mul_i64x2_avx512(self)
2280
}
2281
// Otherwise, we emit AVX fallback sequence.
2282
V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
2283
}
2284
2285
context.stack.push(lhs.into());
2286
context.free_reg(rhs);
2287
2288
Ok(())
2289
}
2290
2291
fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {
2292
self.ensure_has_avx()?;
2293
2294
match kind {
2295
V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {
2296
self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())
2297
}
2298
V128AbsKind::I64x2 => {
2299
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2300
// Perform an arithmetic right shift of 31 bits. If the number
2301
// is positive, this will result in all zeroes in the upper
2302
// 32-bits. If the number is negative, this will result in all
2303
// ones in the upper 32-bits.
2304
masm.asm
2305
.xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);
2306
// Copy the ones and zeroes in the high bits of each 64-bit
2307
// lane to the low bits of each 64-bit lane.
2308
masm.asm.xmm_vpshuf_rr(
2309
scratch.inner(),
2310
scratch.writable(),
2311
0b11_11_01_01,
2312
OperandSize::S32,
2313
);
2314
// Flip the bits in lanes that were negative in `src` and leave
2315
// the positive lanes as they are. Positive lanes will have a
2316
// zero mask in `scratch` so xor doesn't affect them.
2317
masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);
2318
// Subtract the mask from the results of xor which will
2319
// complete the two's complement for lanes which were negative.
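// Added worked example (illustrative only): for a lane holding -5 the
// mask is all 1s (-1), so (-5 ^ -1) - (-1) = 4 + 1 = 5; for a lane
// holding 5 the mask is 0, so (5 ^ 0) - 0 = 5. A scalar sketch:
//
// fn abs64(x: i64) -> i64 {
//     let m = x >> 63; // arithmetic shift: 0 or -1
//     (x ^ m).wrapping_sub(m)
// }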
2320
masm.asm
2321
.xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2322
});
2323
}
2324
V128AbsKind::F32x4 | V128AbsKind::F64x2 => {
2325
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2326
// Create a mask of all ones.
2327
masm.asm.xmm_vpcmpeq_rrr(
2328
scratch.writable(),
2329
scratch.inner(),
2330
scratch.inner(),
2331
kind.lane_size(),
2332
);
2333
// Right shift the mask so each lane is a single zero followed
2334
// by all ones.
2335
masm.asm.xmm_vpsrl_rri(
2336
scratch.inner(),
2337
scratch.writable(),
2338
0x1,
2339
kind.lane_size(),
2340
);
2341
// Use the mask to zero the sign bit in each lane which will
2342
// make the float value positive.
2343
masm.asm
2344
.xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());
2345
});
2346
}
2347
}
2348
Ok(())
2349
}
2350
2351
fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {
2352
self.ensure_has_avx()?;
2353
2354
match kind {
2355
V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {
2356
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2357
masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;
2358
masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;
2359
anyhow::Ok(())
2360
})?;
2361
}
2362
V128NegKind::F32x4 | V128NegKind::F64x2 => {
2363
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2364
// Create a mask of all 1s.
2365
masm.asm.xmm_vpcmpeq_rrr(
2366
tmp.writable(),
2367
tmp.inner(),
2368
tmp.inner(),
2369
kind.lane_size(),
2370
);
2371
// Left shift the lanes in the mask so only the sign bit in the
2372
// mask is set to 1.
2373
masm.asm.xmm_vpsll_rri(
2374
tmp.inner(),
2375
tmp.writable(),
2376
(kind.lane_size().num_bits() - 1) as u32,
2377
kind.lane_size(),
2378
);
2379
// Use the mask to flip the sign bit.
2380
masm.asm
2381
.xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());
2382
});
2383
}
2384
}
2385
Ok(())
2386
}
2387
2388
fn v128_shift(
2389
&mut self,
2390
context: &mut CodeGenContext<Emission>,
2391
lane_width: OperandSize,
2392
kind: ShiftKind,
2393
) -> Result<()> {
2394
self.ensure_has_avx()?;
2395
let shift_amount = context.pop_to_reg(self, None)?.reg;
2396
let operand = context.pop_to_reg(self, None)?.reg;
2397
let amount_mask = lane_width.num_bits() - 1;
2398
2399
self.and(
2400
writable!(shift_amount),
2401
shift_amount,
2402
RegImm::i32(amount_mask as i32),
2403
OperandSize::S32,
2404
)?;
2405
2406
self.with_scratch::<IntScratch, _>(|masm, tmp| {
2407
masm.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2408
let move_to_tmp_xmm = |this: &mut Self| {
2409
this.asm
2410
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2411
};
2412
2413
// A helper for deciding between `vpsllw` and `vpsrlw` in
2414
// `shift_i8x16`.
2415
enum Direction {
2416
Left,
2417
Right,
2418
}
2419
2420
let shift_i8x16 = |this: &mut Self, masks: &'static [u8], direction: Direction| {
2421
// The case for i8x16 is a little bit trickier because x64 doesn't provide an 8-bit
// shift instruction. Instead, we shift as 16 bits, and then mask out the bits that
// migrate across each 8-bit lane, for example (with two 8-bit lanes):
2424
// - Before shifting:
2425
// 01001101 11101110
2426
// - shifting by 2 left:
2427
// 00110111 10111000
2428
// ^^_ these bits come from the previous byte, and need to be masked.
2429
// - The mask:
2430
// 11111100 11111111
2431
// - After masking:
2432
// 00110100 10111000
2433
//
2434
// The mask is loaded from a well-known memory location, depending on the shift amount.
2435
2436
this.asm
2437
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2438
2439
// Perform the 16-bit shift.
2440
match direction {
2441
Direction::Left => this.asm.xmm_vpsll_rrr(
2442
operand,
2443
tmp_xmm.inner(),
2444
writable!(operand),
2445
OperandSize::S16,
2446
),
2447
Direction::Right => this.asm.xmm_vpsrl_rrr(
2448
operand,
2449
tmp_xmm.inner(),
2450
writable!(operand),
2451
OperandSize::S16,
2452
),
2453
}
2454
2455
// Get a handle to the masks array constant.
2456
let masks_addr = this.asm.add_constant(masks);
2457
2458
// Load the masks array effective address into the tmp register.
2459
this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);
2460
2461
// Compute the offset of the mask that we need to use. This is shift_amount * 16 ==
2462
// shift_amount << 4.
2463
this.asm
2464
.shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);
2465
2466
// Load the mask to tmp_xmm.
2467
this.asm.xmm_vmovdqu_mr(
2468
&Address::ImmRegRegShift {
2469
simm32: 0,
2470
base: tmp.inner(),
2471
index: shift_amount,
2472
shift: 0,
2473
},
2474
tmp_xmm.writable(),
2475
MemFlags::trusted(),
2476
);
2477
2478
// Mask unwanted bits from operand.
2479
this.asm
2480
.xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));
2481
};
2482
2483
let i64x2_shr_s = |this: &mut Self,
2484
context: &mut CodeGenContext<Emission>|
2485
-> Result<()> {
2486
const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;
2487
2488
// AVX doesn't have an instruction for i64x2 signed right shift. Instead we use the
2489
// following formula (from Hacker's Delight 2-7), where x is the value and n the shift
2490
// amount, for each lane:
2491
// t = (1 << 63) >> n; ((x >> n) ^ t) - t
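// Added worked example (illustrative only): for x = -16
// (0xFFFF_FFFF_FFFF_FFF0) and n = 2, the logical shift gives
// 0x3FFF_FFFF_FFFF_FFFC, t = 0x2000_0000_0000_0000, and
// ((x >> n) ^ t) - t = 0xFFFF_FFFF_FFFF_FFFC = -4, which is the
// arithmetic shift result. A scalar sketch:
//
// fn sra64(x: u64, n: u32) -> u64 {
//     let t = (1u64 << 63) >> n;
//     ((x >> n) ^ t).wrapping_sub(t)
// }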
2492
2493
// We need an extra scratch register:
2494
let tmp_xmm2 = context.any_fpr(this)?;
2495
2496
this.asm
2497
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2498
2499
let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());
2500
2501
this.asm
2502
.xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
2503
this.asm.xmm_vpsrl_rrr(
2504
tmp_xmm2,
2505
tmp_xmm.inner(),
2506
writable!(tmp_xmm2),
2507
OperandSize::S64,
2508
);
2509
this.asm.xmm_vpsrl_rrr(
2510
operand,
2511
tmp_xmm.inner(),
2512
writable!(operand),
2513
OperandSize::S64,
2514
);
2515
this.asm
2516
.xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));
2517
this.asm
2518
.xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);
2519
2520
context.free_reg(tmp_xmm2);
2521
2522
Ok(())
2523
};
2524
2525
let i8x16_shr_s = |this: &mut Self,
2526
context: &mut CodeGenContext<Emission>|
2527
-> Result<()> {
2528
// Since the x86 instruction set does not have an 8x16 shift instruction and the
2529
// approach used for `ishl` and `ushr` cannot be easily used (the masks do not
2530
// preserve the sign), we use a different approach here: separate the low and
2531
// high lanes, shift them separately, and merge them into the final result.
2532
//
2533
// Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
2534
// s15]:
2535
//
2536
// lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
2537
// shifted_lo.i16x8 = shift each lane of `low`
2538
// hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
2539
// shifted_hi.i16x8 = shift each lane of `high`
2540
// result = [s0'', s1'', ..., s15'']
2541
2542
// In order for `packsswb` later to only use the high byte of each
2543
// 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
2544
// fill in the upper bits appropriately.
2545
this.asm
2546
.add_ir(8, writable!(shift_amount), OperandSize::S32);
2547
this.asm
2548
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2549
2550
let tmp_lo = context.any_fpr(this)?;
2551
let tmp_hi = context.any_fpr(this)?;
2552
2553
// Extract lower and upper bytes.
2554
this.asm
2555
.xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);
2556
this.asm
2557
.xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);
2558
2559
// Perform 16bit right shift of upper and lower bytes.
2560
this.asm.xmm_vpsra_rrr(
2561
tmp_lo,
2562
tmp_xmm.inner(),
2563
writable!(tmp_lo),
2564
OperandSize::S16,
2565
);
2566
this.asm.xmm_vpsra_rrr(
2567
tmp_hi,
2568
tmp_xmm.inner(),
2569
writable!(tmp_hi),
2570
OperandSize::S16,
2571
);
2572
2573
// Merge lower and upper bytes back.
2574
this.asm
2575
.xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);
2576
2577
context.free_reg(tmp_lo);
2578
context.free_reg(tmp_hi);
2579
2580
Ok(())
2581
};
2582
2583
match (lane_width, kind) {
2584
// shl
2585
(OperandSize::S8, ShiftKind::Shl) => {
2586
shift_i8x16(masm, &I8X16_ISHL_MASKS, Direction::Left)
2587
}
2588
(OperandSize::S16, ShiftKind::Shl) => {
2589
move_to_tmp_xmm(masm);
2590
masm.asm.xmm_vpsll_rrr(
2591
operand,
2592
tmp_xmm.inner(),
2593
writable!(operand),
2594
OperandSize::S16,
2595
);
2596
}
2597
(OperandSize::S32, ShiftKind::Shl) => {
2598
move_to_tmp_xmm(masm);
2599
masm.asm.xmm_vpsll_rrr(
2600
operand,
2601
tmp_xmm.inner(),
2602
writable!(operand),
2603
OperandSize::S32,
2604
);
2605
}
2606
(OperandSize::S64, ShiftKind::Shl) => {
2607
move_to_tmp_xmm(masm);
2608
masm.asm.xmm_vpsll_rrr(
2609
operand,
2610
tmp_xmm.inner(),
2611
writable!(operand),
2612
OperandSize::S64,
2613
);
2614
}
2615
// shr_u
2616
(OperandSize::S8, ShiftKind::ShrU) => {
2617
shift_i8x16(masm, &I8X16_USHR_MASKS, Direction::Right)
2618
}
2619
(OperandSize::S16, ShiftKind::ShrU) => {
2620
move_to_tmp_xmm(masm);
2621
masm.asm.xmm_vpsrl_rrr(
2622
operand,
2623
tmp_xmm.inner(),
2624
writable!(operand),
2625
OperandSize::S16,
2626
);
2627
}
2628
(OperandSize::S32, ShiftKind::ShrU) => {
2629
move_to_tmp_xmm(masm);
2630
masm.asm.xmm_vpsrl_rrr(
2631
operand,
2632
tmp_xmm.inner(),
2633
writable!(operand),
2634
OperandSize::S32,
2635
);
2636
}
2637
(OperandSize::S64, ShiftKind::ShrU) => {
2638
move_to_tmp_xmm(masm);
2639
masm.asm.xmm_vpsrl_rrr(
2640
operand,
2641
tmp_xmm.inner(),
2642
writable!(operand),
2643
OperandSize::S64,
2644
);
2645
}
2646
// shr_s
2647
(OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(masm, context)?,
2648
(OperandSize::S16, ShiftKind::ShrS) => {
2649
move_to_tmp_xmm(masm);
2650
masm.asm.xmm_vpsra_rrr(
2651
operand,
2652
tmp_xmm.inner(),
2653
writable!(operand),
2654
OperandSize::S16,
2655
);
2656
}
2657
(OperandSize::S32, ShiftKind::ShrS) => {
2658
move_to_tmp_xmm(masm);
2659
masm.asm.xmm_vpsra_rrr(
2660
operand,
2661
tmp_xmm.inner(),
2662
writable!(operand),
2663
OperandSize::S32,
2664
);
2665
}
2666
(OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(masm, context)?,
2667
2668
_ => bail!(CodeGenError::invalid_operand_combination()),
2669
}
2670
2671
Ok(())
2672
})
2673
})?;
2674
2675
context.free_reg(shift_amount);
2676
context
2677
.stack
2678
.push(TypedReg::new(WasmValType::V128, operand).into());
2679
Ok(())
2680
}
2681
2682
fn v128_q15mulr_sat_s(
2683
&mut self,
2684
lhs: Reg,
2685
rhs: Reg,
2686
dst: WritableReg,
2687
size: OperandSize,
2688
) -> Result<()> {
2689
self.ensure_has_avx()?;
2690
2691
self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);
2692
2693
// Need to handle the edge case of multiplying -1 by -1 (0x8000 in Q15
// format) because of how `vpmulhrs` handles rounding. `vpmulhrs`
// produces 0x8000 in that case when the correct result is 0x7FFF (that
// is, +1), so we need to check if the result is 0x8000 and flip the
// bits of the result if it is.
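// Added worked equation (illustrative only): `vpmulhrs` computes
// ((a * b >> 14) + 1) >> 1 per lane, so for a = b = 0x8000 (-1.0 in
// Q15) it yields ((2^30 >> 14) + 1) >> 1 = 0x8000, i.e. -1.0 instead
// of the saturated +0x7FFF; the compare-and-xor below rewrites 0x8000
// to 0x7FFF in exactly those lanes.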
2698
let address = self.asm.add_constant(&[
2699
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
2700
0x00, 0x80,
2701
]);
2702
self.asm
2703
.xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);
2704
self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
2705
Ok(())
2706
}
2707
2708
fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2709
self.ensure_has_avx()?;
2710
2711
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2712
// Create a mask of all 0s.
2713
masm.asm
2714
.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2715
// Sets each lane of `src` to all 1s if that lane was zero, and to
// zero if the lane was not zero.
2717
masm.asm
2718
.xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);
2719
// Sets ZF if all values are zero (i.e., if all original values were not zero).
2720
masm.asm.xmm_vptest(src, src);
2721
// Set byte if ZF=1.
2722
});
2723
self.asm.setcc(IntCmpKind::Eq, dst);
2724
Ok(())
2725
}
2726
2727
fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2728
self.ensure_has_avx()?;
2729
2730
match size {
2731
OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),
2732
OperandSize::S16 => {
2733
// Signed conversion of 16-bit integers to 8-bit integers.
2734
self.asm
2735
.xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);
2736
// Creates a mask from each byte in `src`.
2737
self.asm
2738
.xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);
2739
// Removes 8 bits added as a result of the `vpackss` step.
2740
self.asm
2741
.shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);
2742
}
2743
OperandSize::S32 | OperandSize::S64 => {
2744
self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)
2745
}
2746
_ => unimplemented!(),
2747
}
2748
2749
Ok(())
2750
}
2751
2752
fn v128_trunc(
2753
&mut self,
2754
context: &mut CodeGenContext<Emission>,
2755
kind: V128TruncKind,
2756
) -> Result<()> {
2757
self.ensure_has_avx()?;
2758
2759
let reg = writable!(context.pop_to_reg(self, None)?.reg);
2760
match kind {
2761
V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(
2762
reg.to_reg(),
2763
reg,
2764
VroundMode::TowardZero,
2765
kind.dst_lane_size(),
2766
),
2767
V128TruncKind::I32x4FromF32x4S => {
2768
self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2769
}
2770
V128TruncKind::I32x4FromF32x4U => {
2771
let temp_reg = writable!(context.any_fpr(self)?);
2772
self.v128_trunc_sat_f32x4_u(
2773
reg,
2774
temp_reg,
2775
kind.src_lane_size(),
2776
kind.dst_lane_size(),
2777
)?;
2778
context.free_reg(temp_reg.to_reg());
2779
}
2780
V128TruncKind::I32x4FromF64x2SZero => {
2781
self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;
2782
}
2783
V128TruncKind::I32x4FromF64x2UZero => {
2784
self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2785
}
2786
}
2787
2788
context.stack.push(TypedReg::v128(reg.to_reg()).into());
2789
Ok(())
2790
}
2791
2792
fn v128_min(
2793
&mut self,
2794
src1: Reg,
2795
src2: Reg,
2796
dst: WritableReg,
2797
kind: V128MinKind,
2798
) -> Result<()> {
2799
self.ensure_has_avx()?;
2800
2801
match kind {
2802
V128MinKind::I8x16S
2803
| V128MinKind::I8x16U
2804
| V128MinKind::I16x8S
2805
| V128MinKind::I16x8U
2806
| V128MinKind::I32x4S
2807
| V128MinKind::I32x4U => {
2808
match kind {
2809
V128MinKind::I8x16S => {
2810
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)
2811
}
2812
V128MinKind::I8x16U => {
2813
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)
2814
}
2815
V128MinKind::I16x8S => {
2816
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)
2817
}
2818
V128MinKind::I16x8U => {
2819
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)
2820
}
2821
V128MinKind::I32x4S => {
2822
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)
2823
}
2824
V128MinKind::I32x4U => {
2825
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)
2826
}
2827
_ => unreachable!(),
2828
};
2829
}
2830
V128MinKind::F32x4 | V128MinKind::F64x2 => {
2831
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2832
// `vminp` is not commutative in its handling of +0 and -0 as well as
// NaN values, so we have to compensate.
2834
// Perform two comparison operations with the operands swapped
2835
// and OR the result to propagate 0 (positive and negative) and
2836
// NaN.
2837
masm.asm
2838
.xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2839
masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());
2840
// Use a single OR instruction to set the sign bit if either
2841
// result has the sign bit set to correctly propagate -0.
2842
masm.asm
2843
.xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2844
});
2845
// Set lanes with NaN to all 1s.
2846
self.asm.xmm_vcmpp_rrr(
2847
writable!(src2),
2848
src2,
2849
dst.to_reg(),
2850
kind.lane_size(),
2851
VcmpKind::Unord,
2852
);
2853
// Doesn't change non-NaN values. For NaN values, sets all bits.
2854
self.asm
2855
.xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2856
self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2857
}
2858
}
2859
2860
Ok(())
2861
}
2862
2863
fn v128_max(
2864
&mut self,
2865
src1: Reg,
2866
src2: Reg,
2867
dst: WritableReg,
2868
kind: V128MaxKind,
2869
) -> Result<()> {
2870
self.ensure_has_avx()?;
2871
2872
match kind {
2873
V128MaxKind::I8x16S
2874
| V128MaxKind::I8x16U
2875
| V128MaxKind::I16x8S
2876
| V128MaxKind::I16x8U
2877
| V128MaxKind::I32x4S
2878
| V128MaxKind::I32x4U => {
2879
match kind {
2880
V128MaxKind::I8x16S => {
2881
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)
2882
}
2883
V128MaxKind::I8x16U => {
2884
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)
2885
}
2886
V128MaxKind::I16x8S => {
2887
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)
2888
}
2889
V128MaxKind::I16x8U => {
2890
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)
2891
}
2892
V128MaxKind::I32x4S => {
2893
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)
2894
}
2895
V128MaxKind::I32x4U => {
2896
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)
2897
}
2898
_ => unreachable!(),
2899
};
2900
}
2901
V128MaxKind::F32x4 | V128MaxKind::F64x2 => {
2902
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2903
// `vmaxp` is not commutative in its handling of +0 and -0 as well as
// NaN values, so we have to compensate.
2905
// Perform two comparison operations with the operands swapped
2906
// so we can propagate 0 (positive and negative) and NaNs
2907
// correctly.
2908
2909
masm.asm
2910
.xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2911
masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());
2912
// This combination of XOR, OR, and SUB will set the sign bit
2913
// on a 0 result to the correct value for a max operation.
2914
masm.asm
2915
.xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2916
masm.asm.xmm_vorp_rrr(
2917
dst.to_reg(),
2918
scratch.inner(),
2919
writable!(src2),
2920
kind.lane_size(),
2921
);
2922
});
2923
self.asm
2924
.xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2925
// Set lanes with NaN values to all 1s.
2926
self.asm.xmm_vcmpp_rrr(
2927
writable!(src2),
2928
src2,
2929
src2,
2930
kind.lane_size(),
2931
VcmpKind::Unord,
2932
);
2933
self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2934
}
2935
}
2936
Ok(())
2937
}
2938
2939
fn v128_extmul(
2940
&mut self,
2941
context: &mut CodeGenContext<Emission>,
2942
kind: V128ExtMulKind,
2943
) -> Result<()> {
2944
self.ensure_has_avx()?;
2945
2946
// The implementation for extmul is not optimized; for simplicity's sake, we simply perform
2947
// an extension followed by a multiplication using already implemented primitives.
2948
2949
let src1 = context.pop_to_reg(self, None)?;
2950
let src2 = context.pop_to_reg(self, None)?;
2951
2952
let ext_kind = kind.into();
2953
self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
2954
self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;
2955
2956
context.stack.push(src2.into());
2957
context.stack.push(src1.into());
2958
2959
self.v128_mul(context, kind.into())
2960
}
2961
2962
fn v128_extadd_pairwise(
2963
&mut self,
2964
src: Reg,
2965
dst: WritableReg,
2966
kind: V128ExtAddKind,
2967
) -> Result<()> {
2968
self.ensure_has_avx()?;
2969
2970
match kind {
2971
V128ExtAddKind::I8x16S => {
2972
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2973
// Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
2974
// sign extend `src` to 16 bits and add adjacent words.
2975
// Need to supply constant as first operand since first operand
2976
// is treated as unsigned and the second operand is signed.
2977
let mask = masm.asm.add_constant(&[1; 16]);
2978
masm.asm.xmm_mov_mr(
2979
&mask,
2980
scratch.writable(),
2981
OperandSize::S128,
2982
MemFlags::trusted(),
2983
);
2984
masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);
2985
});
2986
}
2987
V128ExtAddKind::I8x16U => {
2988
// Same approach as the signed variant but treat `src` as
2989
// unsigned instead of signed by passing it as the first
2990
// operand.
2991
let mask = self.asm.add_constant(&[1; 16]);
2992
self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);
2993
}
2994
V128ExtAddKind::I16x8S => {
2995
// Similar approach to the two variants above. The vector is 8
2996
// lanes of 16-bit 1's and `vpmaddwd` treats both operands as
2997
// signed.
2998
let mask = self
2999
.asm
3000
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3001
self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
3002
}
3003
V128ExtAddKind::I16x8U => {
3004
// Similar approach as the signed variant.
3005
// `vpmaddwd` operates on signed integers and the operand is
3006
// unsigned so the operand needs to be converted to a signed
3007
// format and then that process needs to be reversed after
3008
// `vpmaddwd`.
3009
// Flip the sign bit for 8 16-bit lanes.
3010
let xor_mask = self.asm.add_constant(&[
3011
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
3012
0x80, 0x00, 0x80,
3013
]);
3014
self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
3015
3016
let madd_mask = self
3017
.asm
3018
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3019
self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
3020
3021
// Reverse the XOR. The XOR effectively subtracts 32,768 from
3022
// both pairs that are added together so 65,536 (0x10000)
3023
// needs to be added to 4 lanes of 32-bit values.
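// Added worked equation (illustrative only): XORing a u16 lane with
// 0x8000 reinterprets it as that value minus 32_768, so each pairwise
// sum comes out as (x0 + x1) - 65_536; e.g. for x0 = 65_535 and
// x1 = 1 the biased sum is 0, and adding 0x10000 restores 65_536.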
3024
let add_mask = self
3025
.asm
3026
.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
3027
self.asm
3028
.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
3029
}
3030
}
3031
Ok(())
3032
}
3033
3034
fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
3035
self.ensure_has_avx()?;
3036
self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);
3037
Ok(())
3038
}
3039
3040
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
3041
self.ensure_has_avx()?;
3042
3043
let reg = writable!(context.pop_to_reg(self, None)?.reg);
3044
3045
// This works by using a lookup table to determine the count of bits
3046
// set in the upper 4 bits and lower 4 bits separately and then adding
3047
// the counts.
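// Added worked example (illustrative only): for the byte 0xB7
// (0b1011_0111), the low nibble 0x7 looks up 3 and the high nibble
// 0xB looks up 3, giving a population count of 6. A scalar sketch:
//
// fn popcnt8(x: u8) -> u8 {
//     const TABLE: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];
//     TABLE[(x & 0x0f) as usize] + TABLE[(x >> 4) as usize]
// }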
3048
3049
// A mask to zero out the upper 4 bits in each lane.
3050
let address = self.asm.add_constant(&[
3051
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
3052
0x0F, 0x0F,
3053
]);
3054
3055
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3056
// Zero out the upper 4 bits of each lane.
3057
masm.asm
3058
.xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());
3059
// Right shift bytes in input by 4 bits to put the upper 4 bits in the
3060
// lower 4 bits.
3061
masm.asm
3062
.xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);
3063
// Zero out the upper 4 bits of each shifted lane.
3064
masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
3065
3066
// Write a lookup table of 4 bit values to number of bits set to a
3067
// register so we only perform the memory read once.
3068
// Index (hex) | Value (binary) | Population Count
3069
// 0x0 | 0000 | 0
3070
// 0x1 | 0001 | 1
3071
// 0x2 | 0010 | 1
3072
// 0x3 | 0011 | 2
3073
// 0x4 | 0100 | 1
3074
// 0x5 | 0101 | 2
3075
// 0x6 | 0110 | 2
3076
// 0x7 | 0111 | 3
3077
// 0x8 | 1000 | 1
3078
// 0x9 | 1001 | 2
3079
// 0xA | 1010 | 2
3080
// 0xB | 1011 | 3
3081
// 0xC | 1100 | 2
3082
// 0xD | 1101 | 3
3083
// 0xE | 1110 | 3
3084
// 0xF | 1111 | 4
3085
let address = masm.asm.add_constant(&[
3086
0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
3087
]);
3088
let reg2 = writable!(context.any_fpr(masm)?);
3089
masm.asm
3090
.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
3091
// Use the upper 4 bits as an index into the lookup table.
3092
masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
3093
// Use the lower 4 bits as an index into the lookup table.
3094
masm.asm
3095
.xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());
3096
context.free_reg(reg2.to_reg());
3097
3098
// Add the counts of the upper 4 bits and the lower 4 bits to get the
3099
// total number of bits set.
3100
masm.asm
3101
.xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);
3102
anyhow::Ok(())
3103
})?;
3104
3105
context.stack.push(TypedReg::v128(reg.to_reg()).into());
3106
Ok(())
3107
}
3108
3109
fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3110
self.ensure_has_avx()?;
3111
self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
3112
Ok(())
3113
}
3114
3115
fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3116
self.ensure_has_avx()?;
3117
self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);
3118
Ok(())
3119
}
3120
3121
fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3122
self.ensure_has_avx()?;
3123
self.asm.xmm_vsqrtp_rr(src, dst, size);
3124
Ok(())
3125
}
3126
3127
fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3128
self.ensure_has_avx()?;
3129
self.asm
3130
.xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);
3131
Ok(())
3132
}
3133
3134
fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3135
self.ensure_has_avx()?;
3136
self.asm
3137
.xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);
3138
Ok(())
3139
}
3140
3141
fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3142
self.ensure_has_avx()?;
3143
self.asm
3144
.xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);
3145
Ok(())
3146
}
3147
3148
fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3149
self.ensure_has_avx()?;
3150
// Reverse operands since Wasm specifies returning the first operand if
3151
// either operand is NaN while x86 returns the second operand.
3152
self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);
3153
Ok(())
3154
}
3155
3156
fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3157
self.ensure_has_avx()?;
3158
// Reverse operands since Wasm specifies returning the first operand if
3159
// either operand is NaN while x86 returns the second operand.
3160
self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);
3161
Ok(())
3162
}
3163
}
3164
3165
impl MacroAssembler {
3166
/// Create an x64 MacroAssembler.
3167
pub fn new(
3168
ptr_size: impl PtrSize,
3169
shared_flags: settings::Flags,
3170
isa_flags: x64_settings::Flags,
3171
) -> Result<Self> {
3172
let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());
3173
3174
Ok(Self {
3175
sp_offset: 0,
3176
sp_max: 0,
3177
stack_max_use_add: None,
3178
asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),
3179
flags: isa_flags,
3180
shared_flags,
3181
ptr_size: ptr_type.try_into()?,
3182
scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),
3183
})
3184
}
3185
3186
/// Add the maximum stack used to a register, recording an obligation to update the
3187
/// add-with-immediate instruction emitted to use the real stack max when the masm is being
3188
/// finalized.
3189
fn add_stack_max(&mut self, reg: Reg) {
3190
assert!(self.stack_max_use_add.is_none());
3191
let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);
3192
self.stack_max_use_add.replace(patch);
3193
}
3194
3195
fn ensure_has_avx(&self) -> Result<()> {
3196
anyhow::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);
3197
Ok(())
3198
}
3199
3200
fn ensure_has_avx2(&self) -> Result<()> {
3201
anyhow::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);
3202
Ok(())
3203
}
3204
3205
fn ensure_has_avx512vl(&self) -> Result<()> {
3206
anyhow::ensure!(
3207
self.flags.has_avx512vl(),
3208
CodeGenError::UnimplementedForNoAvx512VL
3209
);
3210
Ok(())
3211
}
3212
3213
fn ensure_has_avx512dq(&self) -> Result<()> {
3214
anyhow::ensure!(
3215
self.flags.has_avx512dq(),
3216
CodeGenError::UnimplementedForNoAvx512DQ
3217
);
3218
Ok(())
3219
}
3220
3221
fn increment_sp(&mut self, bytes: u32) {
3222
self.sp_offset += bytes;
3223
3224
// NOTE: we use `max` here to track the largest stack allocation in `sp_max`. Once we have
3225
// seen the entire function, this value will represent the maximum size for the stack
3226
// frame.
3227
self.sp_max = self.sp_max.max(self.sp_offset);
3228
}
3229
3230
fn decrement_sp(&mut self, bytes: u32) {
3231
assert!(
3232
self.sp_offset >= bytes,
3233
"sp offset = {}; bytes = {}",
3234
self.sp_offset,
3235
bytes
3236
);
3237
self.sp_offset -= bytes;
3238
}
3239
3240
fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {
3241
match constant {
3242
I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),
3243
I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),
3244
I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3245
I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3246
I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3247
}
3248
}
3249
3250
/// A common implementation for zero-extend stack loads.
3251
fn load_impl(
3252
&mut self,
3253
src: Address,
3254
dst: WritableReg,
3255
size: OperandSize,
3256
flags: MemFlags,
3257
) -> Result<()> {
3258
if dst.to_reg().is_int() {
3259
let ext = size.extend_to::<Zero>(OperandSize::S64);
3260
self.asm.movzx_mr(&src, dst, ext, flags);
3261
} else {
3262
self.asm.xmm_mov_mr(&src, dst, size, flags);
3263
}
3264
3265
Ok(())
3266
}
3267
3268
/// A common implementation for stack stores.
3269
fn store_impl(
3270
&mut self,
3271
src: RegImm,
3272
dst: Address,
3273
size: OperandSize,
3274
flags: MemFlags,
3275
) -> Result<()> {
3276
let _ = match src {
3277
RegImm::Imm(imm) => match imm {
3278
I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),
3279
I::I64(v) => match v.try_into() {
3280
Ok(v) => self.asm.mov_im(v, &dst, size, flags),
3281
Err(_) => {
3282
// If the immediate doesn't sign extend, use a scratch
3283
// register.
3284
self.with_scratch::<IntScratch, _>(|masm, scratch| {
3285
masm.asm.mov_ir(v, scratch.writable(), size);
3286
masm.asm.mov_rm(scratch.inner(), &dst, size, flags);
3287
});
3288
}
3289
},
3290
I::F32(v) => {
3291
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3292
self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3293
// Always trusted, since we are loading the constant from
3294
// the constant pool.
3295
masm.asm.xmm_mov_mr(
3296
&addr,
3297
float_scratch.writable(),
3298
size,
3299
MemFlags::trusted(),
3300
);
3301
masm.asm
3302
.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3303
});
3304
}
3305
I::F64(v) => {
3306
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3307
3308
self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3309
// Similar to above, always trusted since we are loading the
3310
// constant from the constant pool.
3311
masm.asm.xmm_mov_mr(
3312
&addr,
3313
float_scratch.writable(),
3314
size,
3315
MemFlags::trusted(),
3316
);
3317
masm.asm
3318
.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3319
});
3320
}
3321
I::V128(v) => {
3322
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3323
self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {
3324
// Always trusted, since we are loading the constant from
3325
// the constant pool.
3326
masm.asm.xmm_mov_mr(
3327
&addr,
3328
vector_scratch.writable(),
3329
size,
3330
MemFlags::trusted(),
3331
);
3332
masm.asm
3333
.xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);
3334
});
3335
}
3336
},
3337
RegImm::Reg(reg) => {
3338
if reg.is_int() {
3339
self.asm.mov_rm(reg, &dst, size, flags);
3340
} else {
3341
self.asm.xmm_mov_rm(reg, &dst, size, flags);
3342
}
3343
}
3344
};
3345
Ok(())
3346
}
3347
3348
fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {
3349
if dst != lhs {
3350
Err(anyhow!(CodeGenError::invalid_two_arg_form()))
3351
} else {
3352
Ok(())
3353
}
3354
}
3355
3356
/// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
3357
fn vpshuf_mask_for_64_bit_splats() -> u8 {
3358
// Results in the first 4 bytes and second 4 bytes being
3359
// swapped and then the swapped bytes being copied.
3360
// [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
3361
// [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
3362
0b01_00_01_00
3363
}
3364
3365
fn v128_trunc_sat_f32x4_s(
3366
&mut self,
3367
reg: WritableReg,
3368
src_lane_size: OperandSize,
3369
dst_lane_size: OperandSize,
3370
) -> Result<()> {
3371
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3372
// Create a mask to handle NaN values (1 for not NaN, 0 for
3373
// NaN).
3374
masm.asm.xmm_vcmpp_rrr(
3375
scratch.writable(),
3376
reg.to_reg(),
3377
reg.to_reg(),
3378
src_lane_size,
3379
VcmpKind::Eq,
3380
);
3381
// Zero out any NaN values.
3382
masm.asm
3383
.xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3384
// Create a mask for the sign bits.
3385
masm.asm
3386
.xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
3387
// Convert floats to integers.
3388
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3389
// Apply sign mask to the converted integers.
3390
masm.asm
3391
.xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
3392
// Create a saturation mask of all 1s for negative numbers,
3393
// all 0s for positive numbers. The arithmetic shift will copy
3394
// the sign bit.
3395
masm.asm
3396
.xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
3397
// Combine converted integers with saturation mask.
3398
masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
3399
Ok(())
3400
})
3401
}
3402
3403
fn v128_trunc_sat_f32x4_u(
3404
&mut self,
3405
reg: WritableReg,
3406
temp_reg: WritableReg,
3407
src_lane_size: OperandSize,
3408
dst_lane_size: OperandSize,
3409
) -> Result<()> {
3410
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3411
// Set scratch to all zeros.
3412
masm.asm.xmm_vxorp_rrr(
3413
reg.to_reg(),
3414
reg.to_reg(),
3415
scratch.writable(),
3416
src_lane_size,
3417
);
3418
// Clamp negative numbers to 0.
3419
masm.asm
3420
.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3421
// Create a vector of all 1s.
3422
masm.asm.xmm_vpcmpeq_rrr(
3423
scratch.writable(),
3424
scratch.inner(),
3425
scratch.inner(),
3426
src_lane_size,
3427
);
3428
// Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
3429
// performing a logical shift right.
3430
masm.asm
3431
.xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
3432
// Convert max signed int to float as a reference point for saturation.
3433
masm.asm
3434
.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
3435
// Convert the floats to integers and put the results in `temp_reg`.
3436
// This is signed and not unsigned so we need to handle the
3437
// value for the high bit in each lane.
3438
masm.asm
3439
.xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
3440
// Set `reg` lanes to the amount that the value in the lane
3441
// exceeds the maximum signed 32-bit integer.
3442
masm.asm
3443
.xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
3444
// Create mask in `scratch` for numbers that are larger than
3445
// the maximum signed 32-bit integer. Lanes that don't fit
3446
// in 32-bits ints will be 1.
3447
masm.asm.xmm_vcmpp_rrr(
3448
scratch.writable(),
3449
scratch.inner(),
3450
reg.to_reg(),
3451
dst_lane_size,
3452
VcmpKind::Le,
3453
);
3454
// Convert the excess over signed 32-bits from floats to integers.
3455
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3456
// Apply large number mask to excess values which will flip the
3457
// bits in any lanes that exceed signed 32-bits. Adding this
3458
// flipped value to the signed value will set the high bit and
3459
// the carry behavior will update the other bits correctly.
3460
masm.asm
3461
.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
3462
// Set `reg` to all 0s.
3463
masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);
3464
// Ensure excess values are not negative by taking the maximum of the
// excess values and zero.
3466
masm.asm
3467
.xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);
3468
});
3469
// Perform the addition between the signed conversion value (in
// `temp_reg`) and the flipped excess value (in `reg`) to get the
// unsigned value.
3472
self.asm
3473
.xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);
3474
Ok(())
3475
}
3476
3477
fn v128_trunc_sat_f64x2_s_zero(
3478
&mut self,
3479
reg: WritableReg,
3480
src_lane_size: OperandSize,
3481
) -> Result<()> {
3482
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3483
// Create a NaN mask (1s for non-NaN, 0s for NaN).
3484
masm.asm.xmm_vcmpp_rrr(
3485
scratch.writable(),
3486
reg.to_reg(),
3487
reg.to_reg(),
3488
src_lane_size,
3489
VcmpKind::Eq,
3490
);
3491
// Clamp NaN values to maximum 64-bit float that can be
3492
// converted to an i32.
3493
let address = masm.asm.add_constant(&[
3494
0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,
3495
0xDF, 0x41,
3496
]);
3497
masm.asm
3498
.xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);
3499
// Handle the saturation for values too large to fit in an i32.
3500
masm.asm
3501
.xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3502
// Convert the floats to integers.
3503
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);
3504
3505
Ok(())
3506
})
3507
}
3508
3509
fn v128_trunc_sat_f64x2_u_zero(
3510
&mut self,
3511
reg: WritableReg,
3512
src_lane_size: OperandSize,
3513
dst_lane_size: OperandSize,
3514
) -> Result<()> {
3515
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3516
// Zero out the scratch register.
3517
masm.asm.xmm_vxorp_rrr(
3518
scratch.inner(),
3519
scratch.inner(),
3520
scratch.writable(),
3521
src_lane_size,
3522
);
3523
// Clamp negative values to zero.
3524
masm.asm
3525
.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3526
// Clamp value to maximum unsigned 32-bit integer value
3527
// (0x41F0000000000000).
3528
let address = masm.asm.add_constant(&[
3529
0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
3530
0xEF, 0x41,
3531
]);
3532
masm.asm
3533
.xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3534
// Truncate floating point values.
3535
masm.asm
3536
.xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);
3537
// Add 2^52 (doubles store 52 bits in their mantissa) to each
3538
// lane causing values in the lower bits to be shifted into
3539
// position for integer conversion.
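// Added worked equation (illustrative only): for any integer
// 0 <= x < 2^32, x + 2^52 is exactly representable and has the bit
// pattern 0x4330_0000_0000_0000 + x, so the lane's low 32 bits now
// hold x; e.g. 7.0 + 2^52 has the bit pattern 0x4330_0000_0000_0007.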
3540
let address = masm.asm.add_constant(&[
3541
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3542
0x30, 0x43,
3543
]);
3544
masm.asm
3545
.xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3546
// Takes lanes 0 and 2 from `reg` (converted values) and lanes
3547
// 0 and 2 from `scratch` (zeroes) to put the converted ints in
3548
// the lower lanes and zeroes in the upper lanes.
3549
masm.asm.xmm_vshufp_rrri(
3550
reg.to_reg(),
3551
scratch.inner(),
3552
reg,
3553
0b10_00_10_00,
3554
dst_lane_size,
3555
);
3556
Ok(())
3557
})
3558
}
3559
3560
/// Given a vector of floats where lanes with NaN values are set to all 1s
/// in `mask` and a vector register `dst` with a mix of non-NaN values and
/// possibly non-canonical NaN values, this canonicalizes any NaNs in `dst`.
3563
fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
3564
// Canonical NaNs do not preserve the sign bit, have the exponent bits
3565
// all set, and have only the high bit of the mantissa set so shift by
3566
// that number.
3567
// The mask we're producing in this step will be inverted in the next
3568
// step.
3569
let amount_to_shift = 1 + size.mantissa_bits() + 1;
3570
self.asm
3571
.xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
3572
// The mask will be inverted by the ANDN so non-NaN values will be all
3573
// 1s and NaN values will set the sign bit, exponent bits, and zero out
3574
// almost all of the mantissa.
3575
self.asm
3576
.xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
3577
}
3578
}
3579
3580