GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/winch/codegen/src/isa/x64/masm.rs
1
use super::{
2
RegAlloc,
3
abi::X64ABI,
4
address::Address,
5
asm::{Assembler, PatchableAddToReg, VcmpKind, VcvtKind, VroundMode},
6
regs::{self, rbp, rsp, scratch_fpr_bitset, scratch_gpr_bitset},
7
};
8
use crate::masm::{
9
DivKind, Extend, ExtendKind, ExtractLaneKind, FloatCmpKind, FloatScratch, Imm as I, IntCmpKind,
10
IntScratch, LaneSelector, LoadKind, MacroAssembler as Masm, MulWideKind, OperandSize, RegImm,
11
RemKind, ReplaceLaneKind, RmwOp, RoundingMode, Scratch, ScratchType, ShiftKind, SplatKind,
12
StoreKind, TRUSTED_FLAGS, TrapCode, TruncKind, UNTRUSTED_FLAGS, V128AbsKind, V128AddKind,
13
V128ConvertKind, V128ExtAddKind, V128ExtMulKind, V128ExtendKind, V128MaxKind, V128MinKind,
14
V128MulKind, V128NarrowKind, V128NegKind, V128SubKind, V128TruncKind, VectorCompareKind,
15
VectorEqualityKind, Zero,
16
};
17
use crate::{
18
Result,
19
abi::{self, LocalSlot, align_to, calculate_frame_adjustment},
20
bail,
21
codegen::{CodeGenContext, CodeGenError, Emission, FuncEnv, ptr_type_from_ptr_size},
22
format_err,
23
stack::{TypedReg, Val},
24
};
25
use crate::{
26
abi::{ABI, vmctx},
27
masm::{SPOffset, StackSlot},
28
};
29
use crate::{
30
isa::{
31
CallingConvention,
32
reg::{Reg, RegClass, WritableReg, writable},
33
},
34
masm::CalleeKind,
35
};
36
use cranelift_codegen::{
37
Final, MachBufferFinalized, MachLabel,
38
binemit::CodeOffset,
39
ir::{MemFlags, RelSourceLoc, SourceLoc},
40
isa::{
41
unwind::UnwindInst,
42
x64::{AtomicRmwSeqOp, args::CC, settings as x64_settings},
43
},
44
settings,
45
};
46
use wasmtime_cranelift::TRAP_UNREACHABLE;
47
use wasmtime_environ::{PtrSize, WasmValType};
48
49
// Taken from `cranelift/codegen/src/isa/x64/lower/isle.rs`
50
// Since x64 doesn't have 8x16 shifts and we must use a 16x8 shift instead, we
51
// need to fix up the bits that migrate from one half of the lane to the
52
// other. Each 16-byte mask is indexed by the shift amount: e.g. if we shift
53
// right by 0 (no movement), we want to retain all the bits so we mask with
54
// `0xff`; if we shift right by 1, we want to retain all bits except the MSB so
55
// we mask with `0x7f`; etc.
56
57
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
58
const I8X16_ISHL_MASKS: [u8; 128] = [
59
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
60
0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe,
61
0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc,
62
0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8,
63
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
64
0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
65
0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0,
66
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
67
];
68
69
#[rustfmt::skip] // Preserve 16 bytes (i.e. one mask) per row.
70
const I8X16_USHR_MASKS: [u8; 128] = [
71
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
73
0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f,
74
0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
75
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
76
0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
77
0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
78
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
79
];
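// Illustrative sketch (added for exposition, not part of the upstream file):
// a scalar model of how one row of the USHR table above repairs an i8x16
// logical right shift that is emulated with a 16x8 shift. The helper name
// and shape are hypothetical.
//
//     fn ushr_i8x16_via_i16x8(v: [u8; 16], amt: u32) -> [u8; 16] {
//         let amt = amt % 8;
//         // Row `amt` holds the same mask byte in all 16 positions.
//         let mask = I8X16_USHR_MASKS[amt as usize * 16];
//         let mut out = [0u8; 16];
//         for i in 0..8 {
//             // The 16x8 shift lets bits from the high byte of each 16-bit
//             // lane leak into the top of the low byte; the mask clears
//             // exactly those leaked bits.
//             let lane = u16::from_le_bytes([v[i * 2], v[i * 2 + 1]]) >> amt;
//             let [lo, hi] = lane.to_le_bytes();
//             out[i * 2] = lo & mask;
//             out[i * 2 + 1] = hi & mask;
//         }
//         out
//     }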
80
81
/// x64 MacroAssembler.
82
pub(crate) struct MacroAssembler {
83
/// Stack pointer offset.
84
sp_offset: u32,
85
/// This value represents the maximum stack size seen while compiling the function. While the
86
/// function is still being compiled its value will not be valid (the stack will grow and
87
/// shrink as space is reserved and freed during compilation), but once all instructions have
88
/// been seen this value will be the maximum stack usage seen.
89
sp_max: u32,
90
/// Add instructions that are used to add the constant stack max to a register.
91
stack_max_use_add: Option<PatchableAddToReg>,
92
/// Low level assembler.
93
asm: Assembler,
94
/// ISA flags.
95
flags: x64_settings::Flags,
96
/// Shared flags.
97
shared_flags: settings::Flags,
98
/// The target pointer size.
99
ptr_size: OperandSize,
100
/// Scratch register scope.
101
scratch_scope: RegAlloc,
102
}
103
104
impl Masm for MacroAssembler {
105
type Address = Address;
106
type Ptr = u8;
107
type ABI = X64ABI;
108
109
fn frame_setup(&mut self) -> Result<()> {
110
let frame_pointer = rbp();
111
let stack_pointer = rsp();
112
113
self.asm.push_r(frame_pointer);
114
115
if self.shared_flags.unwind_info() {
116
self.asm.unwind_inst(UnwindInst::PushFrameRegs {
117
offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
118
})
119
}
120
121
self.asm
122
.mov_rr(stack_pointer, writable!(frame_pointer), OperandSize::S64);
123
124
Ok(())
125
}
126
127
fn check_stack(&mut self, vmctx: Reg) -> Result<()> {
128
let ptr_size: u8 = self.ptr_size.bytes().try_into().unwrap();
129
130
self.with_scratch::<IntScratch, _>(|masm, scratch| {
131
masm.load_ptr(
132
masm.address_at_reg(vmctx, ptr_size.vmcontext_store_context().into())?,
133
scratch.writable(),
134
)?;
135
136
masm.load_ptr(
137
Address::offset(
138
scratch.inner(),
139
ptr_size.vmstore_context_stack_limit().into(),
140
),
141
scratch.writable(),
142
)?;
143
144
masm.add_stack_max(scratch.inner());
145
146
masm.asm.cmp_rr(scratch.inner(), regs::rsp(), masm.ptr_size);
147
masm.asm.trapif(IntCmpKind::GtU, TrapCode::STACK_OVERFLOW);
148
wasmtime_environ::error::Ok(())
149
})?;
150
151
// Emit unwind info.
152
if self.shared_flags.unwind_info() {
153
self.asm.unwind_inst(UnwindInst::DefineNewFrame {
154
offset_upward_to_caller_sp: Self::ABI::arg_base_offset().into(),
155
156
// The Winch calling convention has no callee-save registers, so nothing will be
157
// clobbered.
158
offset_downward_to_clobbers: 0,
159
})
160
}
161
Ok(())
162
}
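// Illustrative model (not the emitted code) of the comparison performed by
// `check_stack`: the loaded stack limit plus this function's worst-case frame
// size (patched in later via `add_stack_max`) is compared against the stack
// pointer, and the function traps when the frame would not fit. The helper
// and its parameters are hypothetical stand-ins for machine state.
//
//     fn would_overflow(stack_limit: u64, frame_max: u64, rsp: u64) -> bool {
//         // Unsigned comparison, mirroring `trapif(IntCmpKind::GtU, ...)`.
//         stack_limit + frame_max > rsp
//     }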
163
164
fn push(&mut self, reg: Reg, size: OperandSize) -> Result<StackSlot> {
165
let bytes = match (reg.class(), size) {
166
(RegClass::Int, OperandSize::S64) => {
167
let word_bytes = <Self::ABI as ABI>::word_bytes() as u32;
168
self.asm.push_r(reg);
169
self.increment_sp(word_bytes);
170
word_bytes
171
}
172
(RegClass::Int, OperandSize::S32) => {
173
let bytes = size.bytes();
174
self.reserve_stack(bytes)?;
175
let sp_offset = SPOffset::from_u32(self.sp_offset);
176
self.asm
177
.mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
178
bytes
179
}
180
(RegClass::Float, _) => {
181
let bytes = size.bytes();
182
self.reserve_stack(bytes)?;
183
let sp_offset = SPOffset::from_u32(self.sp_offset);
184
self.asm
185
.xmm_mov_rm(reg, &self.address_from_sp(sp_offset)?, size, TRUSTED_FLAGS);
186
bytes
187
}
188
_ => unreachable!(),
189
};
190
191
Ok(StackSlot {
192
offset: SPOffset::from_u32(self.sp_offset),
193
size: bytes,
194
})
195
}
196
197
fn reserve_stack(&mut self, bytes: u32) -> Result<()> {
198
if bytes == 0 {
199
return Ok(());
200
}
201
202
self.asm
203
.sub_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
204
self.increment_sp(bytes);
205
206
Ok(())
207
}
208
209
fn free_stack(&mut self, bytes: u32) -> Result<()> {
210
if bytes == 0 {
211
return Ok(());
212
}
213
self.asm
214
.add_ir(bytes as i32, writable!(rsp()), OperandSize::S64);
215
self.decrement_sp(bytes);
216
217
Ok(())
218
}
219
220
fn reset_stack_pointer(&mut self, offset: SPOffset) -> Result<()> {
221
self.sp_offset = offset.as_u32();
222
223
Ok(())
224
}
225
226
fn local_address(&mut self, local: &LocalSlot) -> Result<Address> {
227
let (reg, offset) = if local.addressed_from_sp() {
228
let offset = self
229
.sp_offset
230
.checked_sub(local.offset)
231
.ok_or_else(|| CodeGenError::invalid_local_offset())?;
232
(rsp(), offset)
233
} else {
234
(rbp(), local.offset)
235
};
236
237
Ok(Address::offset(reg, offset))
238
}
239
240
fn address_from_sp(&self, offset: SPOffset) -> Result<Self::Address> {
241
Ok(Address::offset(
242
regs::rsp(),
243
self.sp_offset - offset.as_u32(),
244
))
245
}
246
247
fn address_at_sp(&self, offset: SPOffset) -> Result<Self::Address> {
248
Ok(Address::offset(regs::rsp(), offset.as_u32()))
249
}
250
251
fn address_at_vmctx(&self, offset: u32) -> Result<Self::Address> {
252
Ok(Address::offset(vmctx!(Self), offset))
253
}
254
255
fn store_ptr(&mut self, src: Reg, dst: Self::Address) -> Result<()> {
256
self.store(src.into(), dst, self.ptr_size)
257
}
258
259
fn store(&mut self, src: RegImm, dst: Address, size: OperandSize) -> Result<()> {
260
self.store_impl(src, dst, size, TRUSTED_FLAGS)
261
}
262
263
fn wasm_store(&mut self, src: Reg, dst: Self::Address, kind: StoreKind) -> Result<()> {
264
match kind {
265
StoreKind::Operand(size) => {
266
self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
267
}
268
StoreKind::Atomic(size) => {
269
if size == OperandSize::S128 {
270
// TODO: we don't support 128-bit atomic store yet.
271
bail!(CodeGenError::unexpected_operand_size());
272
}
273
// To stay consistent with cranelift, we emit a normal store followed by an mfence,
274
// although we could probably just emit an xchg.
275
self.store_impl(src.into(), dst, size, UNTRUSTED_FLAGS)?;
276
self.asm.mfence();
277
}
278
StoreKind::VectorLane(LaneSelector { lane, size }) => {
279
self.ensure_has_avx()?;
280
self.asm
281
.xmm_vpextr_rm(&dst, src, lane, size, UNTRUSTED_FLAGS);
282
}
283
}
284
285
Ok(())
286
}
287
288
fn pop(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
289
let current_sp = SPOffset::from_u32(self.sp_offset);
290
let _ = match (dst.to_reg().class(), size) {
291
(RegClass::Int, OperandSize::S32) => {
292
let addr = self.address_from_sp(current_sp)?;
293
self.asm.movzx_mr(
294
&addr,
295
dst,
296
size.extend_to::<Zero>(OperandSize::S64),
297
TRUSTED_FLAGS,
298
);
299
self.free_stack(size.bytes())?;
300
}
301
(RegClass::Int, OperandSize::S64) => {
302
self.asm.pop_r(dst);
303
self.decrement_sp(<Self::ABI as ABI>::word_bytes() as u32);
304
}
305
(RegClass::Float, _) | (RegClass::Vector, _) => {
306
let addr = self.address_from_sp(current_sp)?;
307
self.asm.xmm_mov_mr(&addr, dst, size, TRUSTED_FLAGS);
308
self.free_stack(size.bytes())?;
309
}
310
_ => bail!(CodeGenError::invalid_operand_combination()),
311
};
312
Ok(())
313
}
314
315
fn with_scratch<T: ScratchType, R>(&mut self, f: impl FnOnce(&mut Self, Scratch) -> R) -> R {
316
let r = self
317
.scratch_scope
318
.reg_for_class(T::reg_class(), &mut |_| Ok(()))
319
.expect("Scratch register to be available");
320
321
let ret = f(self, Scratch::new(r));
322
self.scratch_scope.free(r);
323
ret
324
}
325
326
fn call(
327
&mut self,
328
stack_args_size: u32,
329
mut load_callee: impl FnMut(&mut Self) -> Result<(CalleeKind, CallingConvention)>,
330
) -> Result<u32> {
331
let alignment: u32 = <Self::ABI as abi::ABI>::call_stack_align().into();
332
let addend: u32 = <Self::ABI as abi::ABI>::initial_frame_size().into();
333
let delta = calculate_frame_adjustment(self.sp_offset()?.as_u32(), addend, alignment);
334
let aligned_args_size = align_to(stack_args_size, alignment);
335
let total_stack = delta + aligned_args_size;
336
self.reserve_stack(total_stack)?;
337
let (callee, cc) = load_callee(self)?;
338
match callee {
339
CalleeKind::Indirect(reg) => self.asm.call_with_reg(cc, reg),
340
CalleeKind::Direct(idx) => self.asm.call_with_name(cc, idx),
341
};
342
Ok(total_stack)
343
}
344
345
fn load_ptr(&mut self, src: Self::Address, dst: WritableReg) -> Result<()> {
346
self.load(src, dst, self.ptr_size)
347
}
348
349
fn compute_addr(
350
&mut self,
351
src: Self::Address,
352
dst: WritableReg,
353
size: OperandSize,
354
) -> Result<()> {
355
self.asm.lea(&src, dst, size);
356
Ok(())
357
}
358
359
fn load(&mut self, src: Address, dst: WritableReg, size: OperandSize) -> Result<()> {
360
self.load_impl(src, dst, size, TRUSTED_FLAGS)
361
}
362
363
fn wasm_load(&mut self, src: Self::Address, dst: WritableReg, kind: LoadKind) -> Result<()> {
364
let size = kind.derive_operand_size();
365
366
match kind {
367
LoadKind::ScalarExtend(ext) => match ext {
368
ExtendKind::Signed(ext) => {
369
self.asm.movsx_mr(&src, dst, ext, UNTRUSTED_FLAGS);
370
}
371
ExtendKind::Unsigned(_) => self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?,
372
},
373
LoadKind::Operand(_) | LoadKind::Atomic(_, _) => {
374
// The guarantees of the x86-64 memory model ensure that `SeqCst`
375
// loads are equivalent to normal loads.
376
if kind.is_atomic() && size == OperandSize::S128 {
377
bail!(CodeGenError::unexpected_operand_size());
378
}
379
380
self.load_impl(src, dst, size, UNTRUSTED_FLAGS)?;
381
}
382
LoadKind::VectorExtend(ext) => {
383
self.ensure_has_avx()?;
384
self.asm
385
.xmm_vpmov_mr(&src, dst, ext.into(), UNTRUSTED_FLAGS)
386
}
387
LoadKind::Splat(_) => {
388
self.ensure_has_avx()?;
389
390
if size == OperandSize::S64 {
391
self.asm
392
.xmm_mov_mr(&src, dst, OperandSize::S64, UNTRUSTED_FLAGS);
393
self.asm.xmm_vpshuf_rr(
394
dst.to_reg(),
395
dst,
396
Self::vpshuf_mask_for_64_bit_splats(),
397
OperandSize::S32,
398
);
399
} else {
400
self.asm
401
.xmm_vpbroadcast_mr(&src, dst, size, UNTRUSTED_FLAGS);
402
}
403
}
404
LoadKind::VectorLane(LaneSelector { lane, size }) => {
405
self.ensure_has_avx()?;
406
self.with_scratch::<IntScratch, _>(|masm, byte_tmp| {
407
masm.load_impl(src, byte_tmp.writable(), size, UNTRUSTED_FLAGS)?;
408
masm.asm
409
.xmm_vpinsr_rrr(dst, dst.to_reg(), byte_tmp.inner(), lane, size);
410
wasmtime_environ::error::Ok(())
411
})?;
412
}
413
LoadKind::VectorZero(size) => {
414
self.ensure_has_avx()?;
415
self.with_scratch::<IntScratch, _>(|masm, scratch| {
416
masm.load_impl(src, scratch.writable(), size, UNTRUSTED_FLAGS)?;
417
masm.asm.avx_gpr_to_xmm(scratch.inner(), dst, size);
418
wasmtime_environ::error::Ok(())
419
})?;
420
}
421
}
422
423
Ok(())
424
}
425
426
fn sp_offset(&self) -> Result<SPOffset> {
427
Ok(SPOffset::from_u32(self.sp_offset))
428
}
429
430
fn zero(&mut self, reg: WritableReg) -> Result<()> {
431
self.asm.xor_rr(
432
reg.to_reg(),
433
reg,
434
OperandSize::from_bytes(<Self::ABI>::word_bytes()),
435
);
436
Ok(())
437
}
438
439
fn mov(&mut self, dst: WritableReg, src: RegImm, size: OperandSize) -> Result<()> {
440
match (src, dst.to_reg()) {
441
(RegImm::Reg(src), dst_reg) => match (src.class(), dst_reg.class()) {
442
(RegClass::Int, RegClass::Int) => Ok(self.asm.mov_rr(src, dst, size)),
443
(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_mov_rr(src, dst, size)),
444
_ => bail!(CodeGenError::invalid_operand_combination()),
445
},
446
(RegImm::Imm(imm), _) => self.load_constant(&imm, dst, size),
447
}
448
}
449
450
fn cmov(
451
&mut self,
452
dst: WritableReg,
453
src: Reg,
454
cc: IntCmpKind,
455
size: OperandSize,
456
) -> Result<()> {
457
match (src.class(), dst.to_reg().class()) {
458
(RegClass::Int, RegClass::Int) => Ok(self.asm.cmov(src, dst, cc, size)),
459
(RegClass::Float, RegClass::Float) => Ok(self.asm.xmm_cmov(src, dst, cc, size)),
460
_ => Err(format_err!(CodeGenError::invalid_operand_combination())),
461
}
462
}
463
464
fn add(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
465
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
466
match (rhs, dst) {
467
(RegImm::Imm(imm), _) => {
468
if let Some(v) = imm.to_i32() {
469
self.asm.add_ir(v, dst, size);
470
} else {
471
self.with_scratch::<IntScratch, _>(|masm, scratch| {
472
masm.load_constant(&imm, scratch.writable(), size)?;
473
masm.asm.add_rr(scratch.inner(), dst, size);
474
wasmtime_environ::error::Ok(())
475
})?;
476
}
477
}
478
479
(RegImm::Reg(src), dst) => {
480
self.asm.add_rr(src, dst, size);
481
}
482
}
483
484
Ok(())
485
}
486
487
fn checked_uadd(
488
&mut self,
489
dst: WritableReg,
490
lhs: Reg,
491
rhs: RegImm,
492
size: OperandSize,
493
trap: TrapCode,
494
) -> Result<()> {
495
self.add(dst, lhs, rhs, size)?;
496
self.asm.trapif(CC::B, trap);
497
Ok(())
498
}
499
500
fn sub(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
501
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
502
match (rhs, dst) {
503
(RegImm::Imm(imm), reg) => {
504
if let Some(v) = imm.to_i32() {
505
self.asm.sub_ir(v, reg, size);
506
} else {
507
self.with_scratch::<IntScratch, _>(|masm, scratch| {
508
masm.load_constant(&imm, scratch.writable(), size)?;
509
masm.asm.sub_rr(scratch.inner(), reg, size);
510
wasmtime_environ::error::Ok(())
511
})?;
512
}
513
}
514
515
(RegImm::Reg(src), dst) => {
516
self.asm.sub_rr(src, dst, size);
517
}
518
}
519
520
Ok(())
521
}
522
523
fn mul(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
524
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
525
match (rhs, dst) {
526
(RegImm::Imm(imm), _) => {
527
if let Some(v) = imm.to_i32() {
528
self.asm.mul_ir(v, dst, size);
529
} else {
530
self.with_scratch::<IntScratch, _>(|masm, scratch| {
531
masm.load_constant(&imm, scratch.writable(), size)?;
532
masm.asm.mul_rr(scratch.inner(), dst, size);
533
wasmtime_environ::error::Ok(())
534
})?;
535
}
536
}
537
538
(RegImm::Reg(src), dst) => {
539
self.asm.mul_rr(src, dst, size);
540
}
541
}
542
543
Ok(())
544
}
545
546
fn float_add(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
547
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
548
self.asm.xmm_add_rr(rhs, dst, size);
549
Ok(())
550
}
551
552
fn float_sub(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
553
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
554
self.asm.xmm_sub_rr(rhs, dst, size);
555
Ok(())
556
}
557
558
fn float_mul(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
559
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
560
self.asm.xmm_mul_rr(rhs, dst, size);
561
Ok(())
562
}
563
564
fn float_div(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
565
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
566
self.asm.xmm_div_rr(rhs, dst, size);
567
Ok(())
568
}
569
570
fn float_min(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
571
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
572
self.asm.xmm_min_seq(rhs, dst, size);
573
Ok(())
574
}
575
576
fn float_max(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) -> Result<()> {
577
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
578
self.asm.xmm_max_seq(rhs, dst, size);
579
Ok(())
580
}
581
582
fn float_copysign(
583
&mut self,
584
dst: WritableReg,
585
lhs: Reg,
586
rhs: Reg,
587
size: OperandSize,
588
) -> Result<()> {
589
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
590
let sign_mask = match size {
591
OperandSize::S32 => I::I32(0x80000000),
592
OperandSize::S64 => I::I64(0x8000000000000000),
593
OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
594
bail!(CodeGenError::unexpected_operand_size())
595
}
596
};
597
598
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
599
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
600
masm.load_constant(&sign_mask, scratch_gpr.writable(), size)?;
601
masm.asm
602
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
603
604
// Clear everything except sign bit in src.
605
masm.asm
606
.xmm_and_rr(scratch_xmm.inner(), writable!(rhs), size);
607
608
// Clear sign bit in dst using scratch to store result. Then copy the
609
// result back to dst.
610
masm.asm
611
.xmm_andn_rr(dst.to_reg(), scratch_xmm.writable(), size);
612
masm.asm.xmm_mov_rr(scratch_xmm.inner(), dst, size);
613
614
// Copy sign bit from src to dst.
615
masm.asm.xmm_or_rr(rhs, dst, size);
616
Ok(())
617
})
618
})
619
}
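// Scalar sketch (illustrative only, not the emitted code) of the bit
// manipulation performed by `float_copysign` above, for the `f64` case:
//
//     fn copysign_f64(dst: f64, src: f64) -> f64 {
//         let sign_mask = 0x8000_0000_0000_0000u64;
//         let sign = src.to_bits() & sign_mask; // keep only the sign bit of src
//         let magnitude = dst.to_bits() & !sign_mask; // clear the sign bit of dst
//         f64::from_bits(magnitude | sign) // copy the sign bit over
//     }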
620
621
fn float_neg(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
622
debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
623
let mask = match size {
624
OperandSize::S32 => I::I32(0x80000000),
625
OperandSize::S64 => I::I64(0x8000000000000000),
626
OperandSize::S8 | OperandSize::S16 | OperandSize::S128 => {
627
bail!(CodeGenError::unexpected_operand_size())
628
}
629
};
630
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
631
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
632
masm.load_constant(&mask, scratch_gpr.writable(), size)?;
633
masm.asm
634
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
635
masm.asm.xmm_xor_rr(scratch_xmm.inner(), dst, size);
636
Ok(())
637
})
638
})
639
}
640
641
fn float_abs(&mut self, dst: WritableReg, size: OperandSize) -> Result<()> {
642
debug_assert_eq!(dst.to_reg().class(), RegClass::Float);
643
let mask = match size {
644
OperandSize::S32 => I::I32(0x7fffffff),
645
OperandSize::S64 => I::I64(0x7fffffffffffffff),
646
OperandSize::S128 | OperandSize::S16 | OperandSize::S8 => {
647
bail!(CodeGenError::unexpected_operand_size())
648
}
649
};
650
651
self.with_scratch::<IntScratch, _>(|masm, scratch_gpr| {
652
masm.with_scratch::<FloatScratch, _>(|masm, scratch_xmm| {
653
masm.load_constant(&mask, scratch_gpr.writable(), size)?;
654
655
masm.asm
656
.gpr_to_xmm(scratch_gpr.inner(), scratch_xmm.writable(), size);
657
masm.asm.xmm_and_rr(scratch_xmm.inner(), dst, size);
658
Ok(())
659
})
660
})
661
}
662
663
fn float_round<
664
F: FnMut(&mut FuncEnv<Self::Ptr>, &mut CodeGenContext<Emission>, &mut Self) -> Result<()>,
665
>(
666
&mut self,
667
mode: RoundingMode,
668
env: &mut FuncEnv<Self::Ptr>,
669
context: &mut CodeGenContext<Emission>,
670
size: OperandSize,
671
mut fallback: F,
672
) -> Result<()> {
673
if self.flags.has_sse41() {
674
let src = context.pop_to_reg(self, None)?;
675
self.asm
676
.xmm_rounds_rr(src.into(), writable!(src.into()), mode, size);
677
context.stack.push(src.into());
678
Ok(())
679
} else {
680
fallback(env, context, self)
681
}
682
}
683
684
fn float_sqrt(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
685
self.asm.sqrt(src, dst, size);
686
Ok(())
687
}
688
689
fn and(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
690
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
691
match (rhs, dst) {
692
(RegImm::Imm(imm), _) => {
693
if let Some(v) = imm.to_i32() {
694
self.asm.and_ir(v, dst, size);
695
} else {
696
self.with_scratch::<IntScratch, _>(|masm, scratch| {
697
masm.load_constant(&imm, scratch.writable(), size)?;
698
masm.asm.and_rr(scratch.inner(), dst, size);
699
wasmtime_environ::error::Ok(())
700
})?;
701
}
702
}
703
704
(RegImm::Reg(src), dst) => {
705
self.asm.and_rr(src, dst, size);
706
}
707
}
708
709
Ok(())
710
}
711
712
fn or(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
713
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
714
match (rhs, dst) {
715
(RegImm::Imm(imm), _) => {
716
if let Some(v) = imm.to_i32() {
717
self.asm.or_ir(v, dst, size);
718
} else {
719
self.with_scratch::<IntScratch, _>(|masm, scratch| {
720
masm.load_constant(&imm, scratch.writable(), size)?;
721
masm.asm.or_rr(scratch.inner(), dst, size);
722
wasmtime_environ::error::Ok(())
723
})?;
724
}
725
}
726
727
(RegImm::Reg(src), dst) => {
728
self.asm.or_rr(src, dst, size);
729
}
730
}
731
732
Ok(())
733
}
734
735
fn xor(&mut self, dst: WritableReg, lhs: Reg, rhs: RegImm, size: OperandSize) -> Result<()> {
736
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
737
match (rhs, dst) {
738
(RegImm::Imm(imm), _) => {
739
if let Some(v) = imm.to_i32() {
740
self.asm.xor_ir(v, dst, size);
741
} else {
742
self.with_scratch::<IntScratch, _>(|masm, scratch| {
743
masm.load_constant(&imm, scratch.writable(), size)?;
744
masm.asm.xor_rr(scratch.inner(), dst, size);
745
wasmtime_environ::error::Ok(())
746
})?;
747
}
748
}
749
750
(RegImm::Reg(src), _) => {
751
self.asm.xor_rr(src, dst, size);
752
}
753
}
754
755
Ok(())
756
}
757
758
fn shift_ir(
759
&mut self,
760
dst: WritableReg,
761
imm: I,
762
lhs: Reg,
763
kind: ShiftKind,
764
size: OperandSize,
765
) -> Result<()> {
766
Self::ensure_two_argument_form(&dst.to_reg(), &lhs)?;
767
self.asm
768
.shift_ir(imm.unwrap_as_u64() as u8, dst, kind, size);
769
Ok(())
770
}
771
772
fn shift(
773
&mut self,
774
context: &mut CodeGenContext<Emission>,
775
kind: ShiftKind,
776
size: OperandSize,
777
) -> Result<()> {
778
// Number of bits to shift must be in the CL register.
779
let src = context.pop_to_reg(self, Some(regs::rcx()))?;
780
let dst = context.pop_to_reg(self, None)?;
781
782
self.asm
783
.shift_rr(src.into(), writable!(dst.into()), kind, size);
784
785
context.free_reg(src);
786
context.stack.push(dst.into());
787
788
Ok(())
789
}
790
791
fn div(
792
&mut self,
793
context: &mut CodeGenContext<Emission>,
794
kind: DivKind,
795
size: OperandSize,
796
) -> Result<()> {
797
// Allocate rdx:rax.
798
let rdx = context.reg(regs::rdx(), self)?;
799
let rax = context.reg(regs::rax(), self)?;
800
801
// Allocate the divisor, which can be any gpr.
802
let divisor = context.pop_to_reg(self, None)?;
803
804
// Mark rax as allocatable.
805
context.free_reg(rax);
806
// Move the top value to rax.
807
let rax = context.pop_to_reg(self, Some(rax))?;
808
self.asm.div(divisor.into(), (rax.into(), rdx), kind, size);
809
810
// Free the divisor and rdx.
811
context.free_reg(divisor);
812
context.free_reg(rdx);
813
814
// Push the quotient.
815
context.stack.push(rax.into());
816
Ok(())
817
}
818
819
fn rem(
820
&mut self,
821
context: &mut CodeGenContext<Emission>,
822
kind: RemKind,
823
size: OperandSize,
824
) -> Result<()> {
825
// Allocate rdx:rax.
826
let rdx = context.reg(regs::rdx(), self)?;
827
let rax = context.reg(regs::rax(), self)?;
828
829
// Allocate the divisor, which can be any gpr.
830
let divisor = context.pop_to_reg(self, None)?;
831
832
// Mark rax as allocatable.
833
context.free_reg(rax);
834
// Move the top value to rax.
835
let rax = context.pop_to_reg(self, Some(rax))?;
836
self.asm.rem(divisor.reg, (rax.into(), rdx), kind, size);
837
838
// Free the divisor and rax.
839
context.free_reg(divisor);
840
context.free_reg(rax);
841
842
// Push the remainder.
843
context.stack.push(Val::reg(rdx, divisor.ty));
844
845
Ok(())
846
}
847
848
fn frame_restore(&mut self) -> Result<()> {
849
debug_assert_eq!(self.sp_offset, 0);
850
self.asm.pop_r(writable!(rbp()));
851
self.asm.ret();
852
Ok(())
853
}
854
855
fn finalize(mut self, base: Option<SourceLoc>) -> Result<MachBufferFinalized<Final>> {
856
if let Some(patch) = self.stack_max_use_add {
857
patch.finalize(i32::try_from(self.sp_max).unwrap(), self.asm.buffer_mut());
858
}
859
860
Ok(self.asm.finalize(base))
861
}
862
863
fn address_at_reg(&self, reg: Reg, offset: u32) -> Result<Self::Address> {
864
Ok(Address::offset(reg, offset))
865
}
866
867
fn cmp(&mut self, src1: Reg, src2: RegImm, size: OperandSize) -> Result<()> {
868
match src2 {
869
RegImm::Imm(imm) => {
870
if let Some(v) = imm.to_i32() {
871
self.asm.cmp_ir(src1, v, size);
872
} else {
873
self.with_scratch::<IntScratch, _>(|masm, scratch| {
874
masm.load_constant(&imm, scratch.writable(), size)?;
875
masm.asm.cmp_rr(src1, scratch.inner(), size);
876
wasmtime_environ::error::Ok(())
877
})?;
878
}
879
}
880
RegImm::Reg(src2) => {
881
self.asm.cmp_rr(src1, src2, size);
882
}
883
}
884
885
Ok(())
886
}
887
888
fn cmp_with_set(
889
&mut self,
890
dst: WritableReg,
891
src: RegImm,
892
kind: IntCmpKind,
893
size: OperandSize,
894
) -> Result<()> {
895
self.cmp(dst.to_reg(), src, size)?;
896
self.asm.setcc(kind, dst);
897
Ok(())
898
}
899
900
fn float_cmp_with_set(
901
&mut self,
902
dst: WritableReg,
903
src1: Reg,
904
src2: Reg,
905
kind: FloatCmpKind,
906
size: OperandSize,
907
) -> Result<()> {
908
// Float comparisons need to be ordered (that is, comparing with a NaN
909
// should return 0) except for not equal which needs to be unordered.
910
// We use ucomis{s, d} because comis{s, d} has an undefined result if
911
// either operand is NaN. Since ucomis{s, d} is unordered, we need to
912
// compensate to make the comparison ordered. Ucomis{s, d} sets the
913
// ZF, PF, and CF flags if there is an unordered result.
914
let (src1, src2, set_kind) = match kind {
915
FloatCmpKind::Eq => (src1, src2, IntCmpKind::Eq),
916
FloatCmpKind::Ne => (src1, src2, IntCmpKind::Ne),
917
FloatCmpKind::Gt => (src1, src2, IntCmpKind::GtU),
918
FloatCmpKind::Ge => (src1, src2, IntCmpKind::GeU),
919
// Reversing the operands and using the complementary comparison
920
// avoids needing to perform an additional SETNP and AND
921
// instruction.
922
// SETNB and SETNBE check if the carry flag is unset (i.e., not
923
// less than and not unordered) so we get the intended result
924
// without having to look at the parity flag.
925
FloatCmpKind::Lt => (src2, src1, IntCmpKind::GtU),
926
FloatCmpKind::Le => (src2, src1, IntCmpKind::GeU),
927
};
928
self.asm.ucomis(src1, src2, size);
929
self.asm.setcc(set_kind, dst);
930
let _ = match kind {
931
FloatCmpKind::Eq | FloatCmpKind::Gt | FloatCmpKind::Ge => {
932
// Return false if either operand is NaN by ensuring PF is
933
// unset.
934
self.with_scratch::<IntScratch, _>(|masm, scratch| {
935
masm.asm.setnp(scratch.writable());
936
masm.asm.and_rr(scratch.inner(), dst, size);
937
});
938
}
939
FloatCmpKind::Ne => {
940
// Return true if either operand is NaN by checking if PF is
941
// set.
942
self.with_scratch::<IntScratch, _>(|masm, scratch| {
943
masm.asm.setp(scratch.writable());
944
masm.asm.or_rr(scratch.inner(), dst, size);
945
});
946
}
947
FloatCmpKind::Lt | FloatCmpKind::Le => (),
948
};
949
Ok(())
950
}
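// Illustrative scalar model (not the emitted code) of the flag compensation
// above. `ucomis{s,d}` sets ZF when the operands are equal or unordered and
// PF when they are unordered, so `eq` needs the extra SETNP/AND and `ne`
// needs the extra SETP/OR:
//
//     fn f64_eq(a: f64, b: f64) -> bool {
//         let unordered = a.is_nan() || b.is_nan(); // PF
//         let zf = unordered || a == b;             // ZF
//         zf && !unordered
//     }
//
//     fn f64_ne(a: f64, b: f64) -> bool {
//         let unordered = a.is_nan() || b.is_nan();
//         let zf = unordered || a == b;
//         !zf || unordered
//     }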
951
952
fn clz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
953
if self.flags.has_lzcnt() {
954
self.asm.lzcnt(src, dst, size);
955
} else {
956
self.with_scratch::<IntScratch, _>(|masm, scratch| {
957
// Use the following approach:
958
// dst = size.num_bits() - bsr(src) - is_not_zero
959
// = size.num_bits() + -bsr(src) - is_not_zero.
960
masm.asm.bsr(src, dst, size);
961
masm.asm.setcc(IntCmpKind::Ne, scratch.writable());
962
masm.asm.neg(dst.to_reg(), dst, size);
963
masm.asm.add_ir(size.num_bits() as i32, dst, size);
964
masm.asm.sub_rr(scratch.inner(), dst, size);
965
});
966
}
967
968
Ok(())
969
}
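// Scalar sketch (illustrative only) of the BSR-based formula in the comment
// above, for 32-bit operands; the helper is hypothetical and special-cases
// zero explicitly for clarity:
//
//     fn clz32_via_bsr(x: u32) -> u32 {
//         let bsr = if x == 0 { 0 } else { 31 - x.leading_zeros() };
//         let is_not_zero = (x != 0) as u32;
//         // dst = num_bits - bsr(src) - is_not_zero
//         32 - bsr - is_not_zero
//     }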
970
971
fn ctz(&mut self, dst: WritableReg, src: Reg, size: OperandSize) -> Result<()> {
972
if self.flags.has_bmi1() {
973
self.asm.tzcnt(src, dst, size);
974
} else {
975
self.with_scratch::<IntScratch, _>(|masm, scratch| {
976
// Use the following approach:
977
// dst = bsf(src) + (is_zero * size.num_bits())
978
// = bsf(src) + (is_zero << size.log2()).
979
// BSF outputs the correct value for every value except 0.
980
// When the value is 0, BSF outputs 0, but the correct output for ctz is
981
// the number of bits.
982
masm.asm.bsf(src, dst, size);
983
masm.asm.setcc(IntCmpKind::Eq, scratch.writable());
984
masm.asm
985
.shift_ir(size.log2(), scratch.writable(), ShiftKind::Shl, size);
986
masm.asm.add_rr(scratch.inner(), dst, size);
987
});
988
}
989
990
Ok(())
991
}
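// Scalar sketch (illustrative only) of the BSF-based formula in the comment
// above, for 32-bit operands (`size.log2()` is 5 for `S32`):
//
//     fn ctz32_via_bsf(x: u32) -> u32 {
//         let bsf = if x == 0 { 0 } else { x.trailing_zeros() };
//         let is_zero = (x == 0) as u32;
//         // dst = bsf(src) + (is_zero << size.log2())
//         bsf + (is_zero << 5)
//     }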
992
993
fn get_label(&mut self) -> Result<MachLabel> {
994
let buffer = self.asm.buffer_mut();
995
Ok(buffer.get_label())
996
}
997
998
fn bind(&mut self, label: MachLabel) -> Result<()> {
999
let buffer = self.asm.buffer_mut();
1000
buffer.bind_label(label, &mut Default::default());
1001
Ok(())
1002
}
1003
1004
fn branch(
1005
&mut self,
1006
kind: IntCmpKind,
1007
lhs: Reg,
1008
rhs: RegImm,
1009
taken: MachLabel,
1010
size: OperandSize,
1011
) -> Result<()> {
1012
use IntCmpKind::*;
1013
1014
match &(lhs, rhs) {
1015
(rlhs, RegImm::Reg(rrhs)) => {
1016
// If the comparison kind is zero or not zero and both operands
1017
// are the same register, emit a test instruction. Else we emit
1018
// a normal comparison.
1019
if (kind == Eq || kind == Ne) && (rlhs == rrhs) {
1020
self.asm.test_rr(*rlhs, *rrhs, size);
1021
} else {
1022
self.cmp(lhs, rhs, size)?;
1023
}
1024
}
1025
_ => self.cmp(lhs, rhs, size)?,
1026
}
1027
self.asm.jmp_if(kind, taken);
1028
Ok(())
1029
}
1030
1031
fn jmp(&mut self, target: MachLabel) -> Result<()> {
1032
self.asm.jmp(target);
1033
Ok(())
1034
}
1035
1036
fn popcnt(&mut self, context: &mut CodeGenContext<Emission>, size: OperandSize) -> Result<()> {
1037
let src = context.pop_to_reg(self, None)?;
1038
if self.flags.has_popcnt() && self.flags.has_sse42() {
1039
self.asm.popcnt(src.into(), writable!(src.into()), size);
1040
context.stack.push(src.into());
1041
Ok(())
1042
} else {
1043
// The fallback functionality here is based on `MacroAssembler::popcnt64` in:
1044
// https://searchfox.org/mozilla-central/source/js/src/jit/x64/MacroAssembler-x64-inl.h#495
1045
1046
let tmp = writable!(context.any_gpr(self)?);
1047
let dst = writable!(src.into());
1048
let (masks, shift_amt) = match size {
1049
OperandSize::S64 => (
1050
[
1051
0x5555555555555555, // m1
1052
0x3333333333333333, // m2
1053
0x0f0f0f0f0f0f0f0f, // m4
1054
0x0101010101010101, // h01
1055
],
1056
56u8,
1057
),
1058
// 32-bit popcount is the same, except the masks are half as
1059
// wide and we shift by 24 at the end rather than 56
1060
OperandSize::S32 => (
1061
[0x55555555i64, 0x33333333i64, 0x0f0f0f0fi64, 0x01010101i64],
1062
24u8,
1063
),
1064
_ => bail!(CodeGenError::unexpected_operand_size()),
1065
};
1066
self.asm.mov_rr(src.into(), tmp, size);
1067
1068
// x -= (x >> 1) & m1;
1069
self.asm.shift_ir(1u8, dst, ShiftKind::ShrU, size);
1070
let lhs = dst.to_reg();
1071
self.and(writable!(lhs), lhs, RegImm::i64(masks[0]), size)?;
1072
self.asm.sub_rr(dst.to_reg(), tmp, size);
1073
1074
// x = (x & m2) + ((x >> 2) & m2);
1075
self.asm.mov_rr(tmp.to_reg(), dst, size);
1076
// Load `0x3333...` into the scratch reg once, allowing us to use
1077
// `and_rr` and avoid inadvertently loading it twice as with `and`
1078
1079
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1080
masm.load_constant(&I::i64(masks[1]), scratch.writable(), size)?;
1081
masm.asm.and_rr(scratch.inner(), dst, size);
1082
masm.asm.shift_ir(2u8, tmp, ShiftKind::ShrU, size);
1083
masm.asm.and_rr(scratch.inner(), tmp, size);
1084
wasmtime_environ::error::Ok(())
1085
})?;
1086
self.asm.add_rr(dst.to_reg(), tmp, size);
1087
1088
// x = (x + (x >> 4)) & m4;
1089
self.asm.mov_rr(tmp.to_reg(), dst, size);
1090
self.asm.shift_ir(4u8, dst, ShiftKind::ShrU, size);
1091
self.asm.add_rr(tmp.to_reg(), dst, size);
1092
let lhs = dst.to_reg();
1093
self.and(writable!(lhs), lhs, RegImm::i64(masks[2]), size)?;
1094
1095
// (x * h01) >> shift_amt
1096
let lhs = dst.to_reg();
1097
self.mul(writable!(lhs), lhs, RegImm::i64(masks[3]), size)?;
1098
self.asm.shift_ir(shift_amt, dst, ShiftKind::ShrU, size);
1099
1100
context.stack.push(src.into());
1101
context.free_reg(tmp.to_reg());
1102
1103
Ok(())
1104
}
1105
}
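// Scalar sketch (illustrative only) of the SWAR fallback above, using the
// same masks for the 64-bit case:
//
//     fn popcount64(mut x: u64) -> u64 {
//         let (m1, m2, m4, h01) = (
//             0x5555_5555_5555_5555u64,
//             0x3333_3333_3333_3333u64,
//             0x0f0f_0f0f_0f0f_0f0fu64,
//             0x0101_0101_0101_0101u64,
//         );
//         x -= (x >> 1) & m1;             // 1-bit counts -> 2-bit counts
//         x = (x & m2) + ((x >> 2) & m2); // 2-bit counts -> 4-bit counts
//         x = (x + (x >> 4)) & m4;        // 4-bit counts -> per-byte counts
//         x.wrapping_mul(h01) >> 56       // sum the bytes into the top byte
//     }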
1106
1107
fn wrap(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1108
self.asm.mov_rr(src, dst, OperandSize::S32);
1109
Ok(())
1110
}
1111
1112
fn extend(&mut self, dst: WritableReg, src: Reg, kind: ExtendKind) -> Result<()> {
1113
match kind {
1114
ExtendKind::Signed(ext) => {
1115
self.asm.movsx_rr(src, dst, ext);
1116
}
1117
ExtendKind::Unsigned(ext) => {
1118
self.asm.movzx_rr(src, dst, ext);
1119
}
1120
}
1121
1122
Ok(())
1123
}
1124
1125
fn signed_truncate(
1126
&mut self,
1127
dst: WritableReg,
1128
src: Reg,
1129
src_size: OperandSize,
1130
dst_size: OperandSize,
1131
kind: TruncKind,
1132
) -> Result<()> {
1133
self.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1134
masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1135
masm.asm.cvt_float_to_sint_seq(
1136
src,
1137
dst,
1138
gpr_scratch.inner(),
1139
xmm_scratch.inner(),
1140
src_size,
1141
dst_size,
1142
kind.is_checked(),
1143
);
1144
Ok(())
1145
})
1146
})
1147
}
1148
1149
fn unsigned_truncate(
1150
&mut self,
1151
ctx: &mut CodeGenContext<Emission>,
1152
src_size: OperandSize,
1153
dst_size: OperandSize,
1154
kind: TruncKind,
1155
) -> Result<()> {
1156
let dst_ty = match dst_size {
1157
OperandSize::S32 => WasmValType::I32,
1158
OperandSize::S64 => WasmValType::I64,
1159
_ => bail!(CodeGenError::unexpected_operand_size()),
1160
};
1161
1162
ctx.convert_op_with_tmp_reg(
1163
self,
1164
dst_ty,
1165
RegClass::Float,
1166
|masm, dst, src, tmp_fpr, dst_size| {
1167
masm.with_scratch::<IntScratch, _>(|masm, gpr_scratch| {
1168
masm.with_scratch::<FloatScratch, _>(|masm, xmm_scratch| {
1169
masm.asm.cvt_float_to_uint_seq(
1170
src,
1171
writable!(dst),
1172
gpr_scratch.inner(),
1173
xmm_scratch.inner(),
1174
tmp_fpr,
1175
src_size,
1176
dst_size,
1177
kind.is_checked(),
1178
);
1179
Ok(())
1180
})
1181
})
1182
},
1183
)
1184
}
1185
1186
fn signed_convert(
1187
&mut self,
1188
dst: WritableReg,
1189
src: Reg,
1190
src_size: OperandSize,
1191
dst_size: OperandSize,
1192
) -> Result<()> {
1193
self.asm.cvt_sint_to_float(src, dst, src_size, dst_size);
1194
Ok(())
1195
}
1196
1197
fn unsigned_convert(
1198
&mut self,
1199
dst: WritableReg,
1200
src: Reg,
1201
tmp_gpr: Reg,
1202
src_size: OperandSize,
1203
dst_size: OperandSize,
1204
) -> Result<()> {
1205
// Need to convert unsigned uint32 to uint64 for conversion instruction sequence.
1206
if let OperandSize::S32 = src_size {
1207
self.extend(
1208
writable!(src),
1209
src,
1210
ExtendKind::Unsigned(Extend::I64Extend32),
1211
)?;
1212
}
1213
1214
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1215
masm.asm
1216
.cvt_uint64_to_float_seq(src, dst, scratch.inner(), tmp_gpr, dst_size);
1217
Ok(())
1218
})
1219
}
1220
1221
fn reinterpret_float_as_int(
1222
&mut self,
1223
dst: WritableReg,
1224
src: Reg,
1225
size: OperandSize,
1226
) -> Result<()> {
1227
self.asm.xmm_to_gpr(src, dst, size);
1228
Ok(())
1229
}
1230
1231
fn reinterpret_int_as_float(
1232
&mut self,
1233
dst: WritableReg,
1234
src: Reg,
1235
size: OperandSize,
1236
) -> Result<()> {
1237
self.asm.gpr_to_xmm(src, dst, size);
1238
Ok(())
1239
}
1240
1241
fn demote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1242
self.asm
1243
.cvt_float_to_float(src, dst, OperandSize::S64, OperandSize::S32);
1244
Ok(())
1245
}
1246
1247
fn promote(&mut self, dst: WritableReg, src: Reg) -> Result<()> {
1248
self.asm
1249
.cvt_float_to_float(src, dst, OperandSize::S32, OperandSize::S64);
1250
Ok(())
1251
}
1252
1253
fn unreachable(&mut self) -> Result<()> {
1254
self.asm.trap(TRAP_UNREACHABLE);
1255
Ok(())
1256
}
1257
1258
fn trap(&mut self, code: TrapCode) -> Result<()> {
1259
self.asm.trap(code);
1260
Ok(())
1261
}
1262
1263
fn trapif(&mut self, cc: IntCmpKind, code: TrapCode) -> Result<()> {
1264
self.asm.trapif(cc, code);
1265
Ok(())
1266
}
1267
1268
fn trapz(&mut self, src: Reg, code: TrapCode) -> Result<()> {
1269
self.asm.test_rr(src, src, self.ptr_size);
1270
self.asm.trapif(IntCmpKind::Eq, code);
1271
Ok(())
1272
}
1273
1274
fn jmp_table(&mut self, targets: &[MachLabel], index: Reg, tmp: Reg) -> Result<()> {
1275
// At least one default target.
1276
debug_assert!(targets.len() >= 1);
1277
let default_index = targets.len() - 1;
1278
// Emit a bounds check by conditionally moving the max case index
1279
// into the given index reg if the contents of the index reg
1280
// are greater.
1281
let max = default_index;
1282
let size = OperandSize::S32;
1283
self.asm.mov_ir(max as u64, writable!(tmp), size);
1284
self.asm.cmp_rr(tmp, index, size);
1285
self.asm.cmov(tmp, writable!(index), IntCmpKind::LtU, size);
1286
1287
let default = targets[default_index];
1288
let rest = &targets[0..default_index];
1289
1290
self.with_scratch::<IntScratch, _>(|masm, tmp1| {
1291
masm.asm
1292
.jmp_table(rest.into(), default, index, tmp1.inner(), tmp);
1293
Ok(())
1294
})
1295
}
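// Scalar model (illustrative only) of the bounds check above: any index past
// the end of the table is redirected to the default (last) target before the
// indirect jump.
//
//     fn clamp_index(index: usize, targets_len: usize) -> usize {
//         let default_index = targets_len - 1;
//         // The emitted cmov keeps `index` when it is in range and replaces
//         // it with `default_index` otherwise (unsigned comparison).
//         index.min(default_index)
//     }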
1296
1297
fn start_source_loc(&mut self, loc: RelSourceLoc) -> Result<(CodeOffset, RelSourceLoc)> {
1298
Ok(self.asm.buffer_mut().start_srcloc(loc))
1299
}
1300
1301
fn end_source_loc(&mut self) -> Result<()> {
1302
self.asm.buffer_mut().end_srcloc();
1303
Ok(())
1304
}
1305
1306
fn current_code_offset(&self) -> Result<CodeOffset> {
1307
Ok(self.asm.buffer().cur_offset())
1308
}
1309
1310
fn add128(
1311
&mut self,
1312
dst_lo: WritableReg,
1313
dst_hi: WritableReg,
1314
lhs_lo: Reg,
1315
lhs_hi: Reg,
1316
rhs_lo: Reg,
1317
rhs_hi: Reg,
1318
) -> Result<()> {
1319
Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1320
Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1321
self.asm.add_rr(rhs_lo, dst_lo, OperandSize::S64);
1322
self.asm.adc_rr(rhs_hi, dst_hi, OperandSize::S64);
1323
Ok(())
1324
}
1325
1326
fn sub128(
1327
&mut self,
1328
dst_lo: WritableReg,
1329
dst_hi: WritableReg,
1330
lhs_lo: Reg,
1331
lhs_hi: Reg,
1332
rhs_lo: Reg,
1333
rhs_hi: Reg,
1334
) -> Result<()> {
1335
Self::ensure_two_argument_form(&dst_lo.to_reg(), &lhs_lo)?;
1336
Self::ensure_two_argument_form(&dst_hi.to_reg(), &lhs_hi)?;
1337
self.asm.sub_rr(rhs_lo, dst_lo, OperandSize::S64);
1338
self.asm.sbb_rr(rhs_hi, dst_hi, OperandSize::S64);
1339
Ok(())
1340
}
1341
1342
fn mul_wide(
1343
&mut self,
1344
context: &mut CodeGenContext<Emission>,
1345
kind: MulWideKind,
1346
) -> Result<()> {
1347
// Reserve rax/rdx since they're required by the `mul_wide` instruction
1348
// being used here.
1349
let rax = context.reg(regs::rax(), self)?;
1350
let rdx = context.reg(regs::rdx(), self)?;
1351
1352
// The rhs of this binop can be in any register
1353
let rhs = context.pop_to_reg(self, None)?;
1354
// Mark rax as allocatable, and then force the lhs operand to be placed
1355
// in `rax`.
1356
context.free_reg(rax);
1357
let lhs = context.pop_to_reg(self, Some(rax))?;
1358
1359
self.asm.mul_wide(
1360
writable!(rax),
1361
writable!(rdx),
1362
lhs.reg,
1363
rhs.reg,
1364
kind,
1365
OperandSize::S64,
1366
);
1367
1368
// No longer using the rhs register after the multiplication has been
1369
// executed.
1370
context.free_reg(rhs);
1371
1372
// The low bits of the result are in rax, which is where `lhs` was allocated.
1373
context.stack.push(lhs.into());
1374
// The high bits of the result are in rdx, which we previously reserved.
1375
context.stack.push(Val::Reg(TypedReg::i64(rdx)));
1376
1377
Ok(())
1378
}
1379
1380
fn splat(&mut self, context: &mut CodeGenContext<Emission>, size: SplatKind) -> Result<()> {
1381
// Get the source and destination operands set up first.
1382
let (src, dst) = match size {
1383
// Floats can use the same register for `src` and `dst`.
1384
SplatKind::F32x4 | SplatKind::F64x2 => {
1385
let reg = context.pop_to_reg(self, None)?.reg;
1386
(RegImm::reg(reg), writable!(reg))
1387
}
1388
// For ints, we need to load the operand into a vector register if
1389
// it's not a constant.
1390
SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 | SplatKind::I64x2 => {
1391
let dst = writable!(context.any_fpr(self)?);
1392
let src = if size == SplatKind::I64x2 {
1393
context.pop_i64_const().map(RegImm::i64)
1394
} else {
1395
context.pop_i32_const().map(RegImm::i32)
1396
}
1397
.map_or_else(
1398
|| -> Result<RegImm> {
1399
let reg = context.pop_to_reg(self, None)?.reg;
1400
self.reinterpret_int_as_float(
1401
dst,
1402
reg,
1403
match size {
1404
SplatKind::I8x16 | SplatKind::I16x8 | SplatKind::I32x4 => {
1405
OperandSize::S32
1406
}
1407
SplatKind::I64x2 => OperandSize::S64,
1408
SplatKind::F32x4 | SplatKind::F64x2 => unreachable!(),
1409
},
1410
)?;
1411
context.free_reg(reg);
1412
Ok(RegImm::Reg(dst.to_reg()))
1413
},
1414
Ok,
1415
)?;
1416
(src, dst)
1417
}
1418
};
1419
1420
// Perform the splat on the operands.
1421
if size == SplatKind::I64x2 || size == SplatKind::F64x2 {
1422
self.ensure_has_avx()?;
1423
let mask = Self::vpshuf_mask_for_64_bit_splats();
1424
match src {
1425
RegImm::Reg(src) => self.asm.xmm_vpshuf_rr(src, dst, mask, OperandSize::S32),
1426
RegImm::Imm(imm) => {
1427
let src = self.asm.add_constant(&imm.to_bytes());
1428
self.asm
1429
.xmm_vpshuf_mr(&src, dst, mask, OperandSize::S32, MemFlags::trusted());
1430
}
1431
}
1432
} else {
1433
self.ensure_has_avx2()?;
1434
1435
match src {
1436
RegImm::Reg(src) => self.asm.xmm_vpbroadcast_rr(src, dst, size.lane_size()),
1437
RegImm::Imm(imm) => {
1438
let src = self.asm.add_constant(&imm.to_bytes());
1439
self.asm
1440
.xmm_vpbroadcast_mr(&src, dst, size.lane_size(), MemFlags::trusted());
1441
}
1442
}
1443
}
1444
1445
context
1446
.stack
1447
.push(Val::reg(dst.to_reg(), WasmValType::V128));
1448
Ok(())
1449
}
1450
1451
fn shuffle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, lanes: [u8; 16]) -> Result<()> {
1452
self.ensure_has_avx()?;
1453
1454
// Use `vpshufb` with `lanes` to set the lanes in `lhs` and `rhs`
1455
// separately to either the selected index or 0.
1456
// Then use `vpor` to combine `lhs` and `rhs` into `dst`.
1457
// Setting the most significant bit in the mask's lane to 1 will
1458
// result in corresponding lane in the destination register being
1459
// set to 0. 0x80 sets the most significant bit to 1.
1460
let mut mask_lhs: [u8; 16] = [0x80; 16];
1461
let mut mask_rhs: [u8; 16] = [0x80; 16];
1462
for i in 0..lanes.len() {
1463
if lanes[i] < 16 {
1464
mask_lhs[i] = lanes[i];
1465
} else {
1466
mask_rhs[i] = lanes[i] - 16;
1467
}
1468
}
1469
let mask_lhs = self.asm.add_constant(&mask_lhs);
1470
let mask_rhs = self.asm.add_constant(&mask_rhs);
1471
1472
self.asm.xmm_vpshufb_rrm(dst, lhs, &mask_lhs);
1473
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1474
masm.asm.xmm_vpshufb_rrm(scratch.writable(), rhs, &mask_rhs);
1475
masm.asm.xmm_vpor_rrr(dst, dst.to_reg(), scratch.inner());
1476
Ok(())
1477
})
1478
}
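// Scalar model (illustrative only) of the single-source `vpshufb` step used
// above: a mask byte with its most significant bit set zeroes the output
// lane, otherwise its low four bits select a source lane. The shuffle then
// OR-combines the two half-results.
//
//     fn pshufb(src: [u8; 16], mask: [u8; 16]) -> [u8; 16] {
//         let mut out = [0u8; 16];
//         for i in 0..16 {
//             if mask[i] & 0x80 == 0 {
//                 out[i] = src[(mask[i] & 0x0f) as usize];
//             }
//         }
//         out
//     }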
1479
1480
fn swizzle(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg) -> Result<()> {
1481
self.ensure_has_avx()?;
1482
1483
// Clamp rhs to [0, 15 (i.e., 0xF)] and substitute 0 for anything
1484
// outside that range.
1485
// Each lane is a signed byte so the maximum value is 0x7F. Adding
1486
// 0x70 to any value higher than 0xF will saturate, resulting in a value
1487
// of 0xFF (i.e., 0).
1488
let clamp = self.asm.add_constant(&[0x70; 16]);
1489
self.asm
1490
.xmm_vpaddus_rrm(writable!(rhs), rhs, &clamp, OperandSize::S8);
1491
1492
// Don't need to subtract 0x70 since `vpshufb` uses the least
1493
// significant 4 bits which are the same after adding 0x70.
1494
self.asm.xmm_vpshufb_rrr(dst, lhs, rhs);
1495
Ok(())
1496
}
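// Scalar sketch (illustrative only) of the clamping step above: after the
// unsigned saturating add of 0x70, every index above 15 ends up with its
// most significant bit set, which makes `vpshufb` write 0 for that lane,
// while indices 0..=15 keep their low four bits, which is all `vpshufb`
// reads.
//
//     fn swizzle_lane(lhs: [u8; 16], idx: u8) -> u8 {
//         let adjusted = idx.saturating_add(0x70);
//         if adjusted & 0x80 != 0 {
//             0
//         } else {
//             lhs[(adjusted & 0x0f) as usize]
//         }
//     }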
1497
1498
fn atomic_rmw(
1499
&mut self,
1500
context: &mut CodeGenContext<Emission>,
1501
addr: Self::Address,
1502
size: OperandSize,
1503
op: RmwOp,
1504
flags: MemFlags,
1505
extend: Option<Extend<Zero>>,
1506
) -> Result<()> {
1507
let res = match op {
1508
RmwOp::Add => {
1509
let operand = context.pop_to_reg(self, None)?;
1510
self.asm
1511
.lock_xadd(addr, writable!(operand.reg), size, flags);
1512
operand.reg
1513
}
1514
RmwOp::Sub => {
1515
let operand = context.pop_to_reg(self, None)?;
1516
self.asm.neg(operand.reg, writable!(operand.reg), size);
1517
self.asm
1518
.lock_xadd(addr, writable!(operand.reg), size, flags);
1519
operand.reg
1520
}
1521
RmwOp::Xchg => {
1522
let operand = context.pop_to_reg(self, None)?;
1523
self.asm.xchg(addr, writable!(operand.reg), size, flags);
1524
operand.reg
1525
}
1526
RmwOp::And | RmwOp::Or | RmwOp::Xor => {
1527
let op = match op {
1528
RmwOp::And => AtomicRmwSeqOp::And,
1529
RmwOp::Or => AtomicRmwSeqOp::Or,
1530
RmwOp::Xor => AtomicRmwSeqOp::Xor,
1531
_ => unreachable!(
1532
"invalid op for atomic_rmw_seq, should be one of `or`, `and` or `xor`"
1533
),
1534
};
1535
let dst = context.reg(regs::rax(), self)?;
1536
let operand = context.pop_to_reg(self, None)?;
1537
1538
self.with_scratch::<IntScratch, _>(|masm, scratch| {
1539
masm.asm.atomic_rmw_seq(
1540
addr,
1541
operand.reg,
1542
writable!(dst),
1543
scratch.writable(),
1544
size,
1545
flags,
1546
op,
1547
);
1548
});
1549
1550
context.free_reg(operand.reg);
1551
dst
1552
}
1553
};
1554
1555
let dst_ty = match extend {
1556
Some(ext) => {
1557
// We don't need to zero-extend from 32 to 64bits.
1558
if !(ext.from_bits() == 32 && ext.to_bits() == 64) {
1559
self.asm.movzx_rr(res, writable!(res), ext);
1560
}
1561
1562
WasmValType::int_from_bits(ext.to_bits())
1563
}
1564
None => WasmValType::int_from_bits(size.num_bits()),
1565
};
1566
1567
context.stack.push(TypedReg::new(dst_ty, res).into());
1568
1569
Ok(())
1570
}
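// Illustrative note (not the emitted code): the `RmwOp::Sub` case above
// works because an atomic subtract is equivalent to an atomic add of the
// two's-complement negation, so a `neg` followed by `lock xadd` suffices:
//
//     use std::sync::atomic::{AtomicU32, Ordering};
//
//     fn atomic_sub_via_add(v: &AtomicU32, x: u32) -> u32 {
//         // Returns the previous value, matching lock xadd's exchange behavior.
//         v.fetch_add(x.wrapping_neg(), Ordering::SeqCst)
//     }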
1571
1572
fn extract_lane(
1573
&mut self,
1574
src: Reg,
1575
dst: WritableReg,
1576
lane: u8,
1577
kind: ExtractLaneKind,
1578
) -> Result<()> {
1579
self.ensure_has_avx()?;
1580
1581
match kind {
1582
ExtractLaneKind::I8x16S
1583
| ExtractLaneKind::I8x16U
1584
| ExtractLaneKind::I16x8S
1585
| ExtractLaneKind::I16x8U
1586
| ExtractLaneKind::I32x4
1587
| ExtractLaneKind::I64x2 => self.asm.xmm_vpextr_rr(dst, src, lane, kind.lane_size()),
1588
ExtractLaneKind::F32x4 | ExtractLaneKind::F64x2 if lane == 0 => {
1589
// If the `src` and `dst` registers are the same, then the
1590
// appropriate value is already in the correct position in
1591
// the register.
1592
assert!(src == dst.to_reg());
1593
}
1594
ExtractLaneKind::F32x4 => self.asm.xmm_vpshuf_rr(src, dst, lane, kind.lane_size()),
1595
ExtractLaneKind::F64x2 => {
1596
// `0b11_10` selects the high and low 32-bits of the second
1597
// 64-bit, so `0b11_10_11_10` splats the 64-bit value across
1598
// both lanes. Since we put an `f64` on the stack, we use
1599
// the splatted value.
1600
// Double-check `lane == 0` was handled in another branch.
1601
assert!(lane == 1);
1602
self.asm
1603
.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, OperandSize::S32)
1604
}
1605
}
1606
1607
// Sign-extend to 32-bits for sign extended kinds.
1608
match kind {
1609
ExtractLaneKind::I8x16S | ExtractLaneKind::I16x8S => {
1610
self.asm.movsx_rr(dst.to_reg(), dst, kind.into())
1611
}
1612
_ => (),
1613
}
1614
1615
Ok(())
1616
}
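// Illustrative sketch (not the emitted code) of how the `vpshufd` immediate
// used above selects 32-bit elements: each 2-bit field `i` of the immediate
// picks the source dword written to destination dword `i`, so 0b11_10_11_10
// copies the upper 64 bits of `src` into both halves.
//
//     fn pshufd(src: [u32; 4], imm: u8) -> [u32; 4] {
//         let mut out = [0u32; 4];
//         for i in 0..4 {
//             out[i] = src[((imm >> (i * 2)) & 0b11) as usize];
//         }
//         out
//     }
//
//     // pshufd(src, 0b11_10_11_10) == [src[2], src[3], src[2], src[3]]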
1617
1618
fn replace_lane(
1619
&mut self,
1620
src: RegImm,
1621
dst: WritableReg,
1622
lane: u8,
1623
kind: ReplaceLaneKind,
1624
) -> Result<()> {
1625
self.ensure_has_avx()?;
1626
1627
match kind {
1628
ReplaceLaneKind::I8x16
1629
| ReplaceLaneKind::I16x8
1630
| ReplaceLaneKind::I32x4
1631
| ReplaceLaneKind::I64x2 => match src {
1632
RegImm::Reg(reg) => {
1633
self.asm
1634
.xmm_vpinsr_rrr(dst, dst.to_reg(), reg, lane, kind.lane_size());
1635
}
1636
RegImm::Imm(imm) => {
1637
let address = self.asm.add_constant(&imm.to_bytes());
1638
self.asm
1639
.xmm_vpinsr_rrm(dst, dst.to_reg(), &address, lane, kind.lane_size());
1640
}
1641
},
1642
ReplaceLaneKind::F32x4 => {
1643
// Immediate for `vinsertps` uses first 3 bits to determine
1644
// which elements of the destination to set to 0. The next 2
1645
// bits specify which element of the destination will be
1646
// overwritten.
1647
let imm = lane << 4;
1648
match src {
1649
RegImm::Reg(reg) => self.asm.xmm_vinsertps_rrr(dst, dst.to_reg(), reg, imm),
1650
RegImm::Imm(val) => {
1651
let address = self.asm.add_constant(&val.to_bytes());
1652
self.asm.xmm_vinsertps_rrm(dst, dst.to_reg(), &address, imm);
1653
}
1654
}
1655
}
1656
ReplaceLaneKind::F64x2 => match src {
1657
RegImm::Reg(reg) => match lane {
1658
0 => self.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), reg),
1659
1 => self.asm.xmm_vmovlhps_rrr(dst, dst.to_reg(), reg),
1660
_ => unreachable!(),
1661
},
1662
RegImm::Imm(imm) => {
1663
let address = self.asm.add_constant(&imm.to_bytes());
1664
match lane {
1665
0 => {
1666
// Memory load variant of `vmovsd` zeroes the upper
1667
// 64 bits of the register so need to load the
1668
// immediate to a register to use the register
1669
// variant of `vmovsd` to perform the merge.
1670
1671
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1672
masm.asm.xmm_vmovsd_rm(scratch.writable(), &address);
1673
masm.asm.xmm_vmovsd_rrr(dst, dst.to_reg(), scratch.inner());
1674
});
1675
}
1676
1 => self.asm.xmm_vmovlhps_rrm(dst, dst.to_reg(), &address),
1677
_ => unreachable!(),
1678
}
1679
}
1680
},
1681
}
1682
Ok(())
1683
}
1684
1685
fn atomic_cas(
1686
&mut self,
1687
context: &mut CodeGenContext<Emission>,
1688
addr: Self::Address,
1689
size: OperandSize,
1690
flags: MemFlags,
1691
extend: Option<Extend<Zero>>,
1692
) -> Result<()> {
1693
// `cmpxchg` expects `expected` to be in the `*a*` register.
1694
// Reserve rax for the expected argument.
1695
let rax = context.reg(regs::rax(), self)?;
1696
1697
let replacement = context.pop_to_reg(self, None)?;
1698
1699
// Mark `rax` as allocatable again.
1700
context.free_reg(rax);
1701
let expected = context.pop_to_reg(self, Some(regs::rax()))?;
1702
1703
self.asm
1704
.cmpxchg(addr, replacement.reg, writable!(expected.reg), size, flags);
1705
1706
if let Some(extend) = extend {
1707
// We don't need to zero-extend from 32 to 64 bits.
1708
if !(extend.from_bits() == 32 && extend.to_bits() == 64) {
1709
self.asm
1710
.movzx_rr(expected.reg, writable!(expected.reg), extend);
1711
}
1712
}
1713
1714
context.stack.push(expected.into());
1715
context.free_reg(replacement);
1716
1717
Ok(())
1718
}
1719
1720
fn v128_eq(
1721
&mut self,
1722
dst: WritableReg,
1723
lhs: Reg,
1724
rhs: Reg,
1725
kind: VectorEqualityKind,
1726
) -> Result<()> {
1727
self.ensure_has_avx()?;
1728
1729
match kind {
1730
VectorEqualityKind::I8x16
1731
| VectorEqualityKind::I16x8
1732
| VectorEqualityKind::I32x4
1733
| VectorEqualityKind::I64x2 => {
1734
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size())
1735
}
1736
VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1737
self.asm
1738
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Eq)
1739
}
1740
}
1741
Ok(())
1742
}
1743
1744
fn v128_ne(
1745
&mut self,
1746
dst: WritableReg,
1747
lhs: Reg,
1748
rhs: Reg,
1749
kind: VectorEqualityKind,
1750
) -> Result<()> {
1751
self.ensure_has_avx()?;
1752
1753
match kind {
1754
VectorEqualityKind::I8x16
1755
| VectorEqualityKind::I16x8
1756
| VectorEqualityKind::I32x4
1757
| VectorEqualityKind::I64x2 => {
1758
// Check for equality and invert the results.
1759
self.asm
1760
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1761
self.asm
1762
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1763
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1764
}
1765
VectorEqualityKind::F32x4 | VectorEqualityKind::F64x2 => {
1766
self.asm
1767
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Ne)
1768
}
1769
}
1770
Ok(())
1771
}
1772
1773
fn v128_lt(
1774
&mut self,
1775
dst: WritableReg,
1776
lhs: Reg,
1777
rhs: Reg,
1778
kind: VectorCompareKind,
1779
) -> Result<()> {
1780
self.ensure_has_avx()?;
1781
1782
match kind {
1783
VectorCompareKind::I8x16S
1784
| VectorCompareKind::I16x8S
1785
| VectorCompareKind::I32x4S
1786
| VectorCompareKind::I64x2S => {
1787
// Perform a greater than check with reversed parameters.
1788
self.asm.xmm_vpcmpgt_rrr(dst, rhs, lhs, kind.lane_size())
1789
}
1790
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1791
// Set `lhs` to min values, check for equality, then invert the
1792
// result.
1793
// If `lhs` is smaller, then equality check will fail and result
1794
// will be inverted to true. Otherwise the equality check will
1795
// pass and be inverted to false.
1796
self.asm
1797
.xmm_vpminu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1798
self.asm
1799
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1800
self.asm
1801
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1802
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1803
}
1804
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1805
self.asm
1806
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Lt)
1807
}
1808
}
1809
Ok(())
1810
}
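// Scalar sketch (illustrative only) of the unsigned less-than trick above on
// a single lane: `lhs < rhs` holds exactly when `min(lhs, rhs)` differs from
// `rhs`, so the min / compare-equal / invert sequence produces the right
// mask.
//
//     fn u8_lt_via_min(lhs: u8, rhs: u8) -> bool {
//         lhs.min(rhs) != rhs
//     }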
1811
1812
fn v128_le(
1813
&mut self,
1814
dst: WritableReg,
1815
lhs: Reg,
1816
rhs: Reg,
1817
kind: VectorCompareKind,
1818
) -> Result<()> {
1819
self.ensure_has_avx()?;
1820
1821
match kind {
1822
VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1823
// Set the `rhs` vector to the signed minimum values and then
1824
// compare them with `lhs` for equality.
1825
self.asm
1826
.xmm_vpmins_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1827
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1828
}
1829
VectorCompareKind::I64x2S => {
1830
// Do a greater than check and invert the results.
1831
self.asm
1832
.xmm_vpcmpgt_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1833
self.asm
1834
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1835
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1836
}
1837
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1838
// Set the `rhs` vector to the signed minimum values and then
1839
// compare them with `lhs` for equality.
1840
self.asm
1841
.xmm_vpminu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1842
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1843
}
1844
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1845
self.asm
1846
.xmm_vcmpp_rrr(dst, lhs, rhs, kind.lane_size(), VcmpKind::Le)
1847
}
1848
}
1849
Ok(())
1850
}
1851
1852
fn v128_gt(
1853
&mut self,
1854
dst: WritableReg,
1855
lhs: Reg,
1856
rhs: Reg,
1857
kind: VectorCompareKind,
1858
) -> Result<()> {
1859
self.ensure_has_avx()?;
1860
1861
match kind {
1862
VectorCompareKind::I8x16S
1863
| VectorCompareKind::I16x8S
1864
| VectorCompareKind::I32x4S
1865
| VectorCompareKind::I64x2S => {
1866
self.asm.xmm_vpcmpgt_rrr(dst, lhs, rhs, kind.lane_size())
1867
}
1868
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1869
// Set `lhs` to max values, check for equality, then invert the
1870
// result.
1871
// If `lhs` is larger, then the equality check will fail and the result
1872
// will be inverted to true. Otherwise the equality check will
1873
// pass and be inverted to false.
1874
self.asm
1875
.xmm_vpmaxu_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1876
self.asm
1877
.xmm_vpcmpeq_rrr(writable!(lhs), lhs, rhs, kind.lane_size());
1878
self.asm
1879
.xmm_vpcmpeq_rrr(writable!(rhs), rhs, rhs, kind.lane_size());
1880
self.asm.xmm_vpxor_rrr(lhs, rhs, dst);
1881
}
1882
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1883
// Do a less than comparison with the operands swapped.
1884
self.asm
1885
.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Lt)
1886
}
1887
}
1888
Ok(())
1889
}
1890
1891
fn v128_ge(
1892
&mut self,
1893
dst: WritableReg,
1894
lhs: Reg,
1895
rhs: Reg,
1896
kind: VectorCompareKind,
1897
) -> Result<()> {
1898
self.ensure_has_avx()?;
1899
1900
match kind {
1901
VectorCompareKind::I8x16S | VectorCompareKind::I16x8S | VectorCompareKind::I32x4S => {
1902
// Set each lane to maximum value and then compare for equality.
1903
self.asm
1904
.xmm_vpmaxs_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1905
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1906
}
1907
VectorCompareKind::I64x2S => {
1908
// Perform a greater than comparison with operands swapped,
1909
// then invert the results.
1910
self.asm
1911
.xmm_vpcmpgt_rrr(writable!(rhs), rhs, lhs, kind.lane_size());
1912
self.asm.xmm_vpcmpeq_rrr(dst, lhs, lhs, kind.lane_size());
1913
self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
1914
}
1915
VectorCompareKind::I8x16U | VectorCompareKind::I16x8U | VectorCompareKind::I32x4U => {
1916
// Set lanes to maximum values and compare them for equality.
1917
self.asm
1918
.xmm_vpmaxu_rrr(writable!(rhs), lhs, rhs, kind.lane_size());
1919
self.asm.xmm_vpcmpeq_rrr(dst, lhs, rhs, kind.lane_size());
1920
}
1921
VectorCompareKind::F32x4 | VectorCompareKind::F64x2 => {
1922
// Perform a less than or equal comparison on swapped operands.
1923
self.asm
1924
.xmm_vcmpp_rrr(dst, rhs, lhs, kind.lane_size(), VcmpKind::Le)
1925
}
1926
}
1927
1928
Ok(())
1929
}
1930
1931
fn fence(&mut self) -> Result<()> {
1932
self.asm.mfence();
1933
Ok(())
1934
}
1935
1936
fn v128_not(&mut self, dst: WritableReg) -> Result<()> {
1937
self.ensure_has_avx()?;
1938
1939
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
1940
// First, we initialize `tmp` with all ones by comparing it with
1941
// itself.
1942
masm.asm
1943
.xmm_vpcmpeq_rrr(tmp.writable(), tmp.inner(), tmp.inner(), OperandSize::S32);
1944
// Then we `xor` tmp and `dst` together, yielding `!dst`.
1945
masm.asm.xmm_vpxor_rrr(tmp.inner(), dst.to_reg(), dst);
1946
Ok(())
1947
})
1948
}
1949
1950
fn v128_and(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1951
self.ensure_has_avx()?;
1952
self.asm.xmm_vpand_rrr(src1, src2, dst);
1953
Ok(())
1954
}
1955
1956
fn v128_and_not(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1957
self.ensure_has_avx()?;
1958
self.asm.xmm_vpandn_rrr(src1, src2, dst);
1959
Ok(())
1960
}
1961
1962
fn v128_or(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1963
self.ensure_has_avx()?;
1964
self.asm.xmm_vpor_rrr(dst, src1, src2);
1965
Ok(())
1966
}
1967
1968
fn v128_xor(&mut self, src1: Reg, src2: Reg, dst: WritableReg) -> Result<()> {
1969
self.ensure_has_avx()?;
1970
self.asm.xmm_vpxor_rrr(src1, src2, dst);
1971
Ok(())
1972
}
1973
1974
fn v128_bitselect(&mut self, src1: Reg, src2: Reg, mask: Reg, dst: WritableReg) -> Result<()> {
1975
self.ensure_has_avx()?;
1976
1977
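// Bit-select keeps the bits of `src1` where `mask` is 1 and the bits of
// `src2` where `mask` is 0, i.e. dst = (src1 & mask) | (src2 & !mask).
// The steps below compute the two halves (in `tmp` and `dst`) and OR
// them together.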
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
1978
masm.v128_and(src1, mask, tmp.writable())?;
1979
masm.v128_and_not(mask, src2, dst)?;
1980
masm.v128_or(dst.to_reg(), tmp.inner(), dst)?;
1981
Ok(())
1982
})
1983
}
1984
1985
fn v128_any_true(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
1986
self.ensure_has_avx()?;
1987
self.asm.xmm_vptest(src, src);
1988
self.asm.setcc(IntCmpKind::Ne, dst);
1989
Ok(())
1990
}
1991
1992
fn v128_convert(&mut self, src: Reg, dst: WritableReg, kind: V128ConvertKind) -> Result<()> {
1993
self.ensure_has_avx()?;
1994
match kind {
1995
V128ConvertKind::I32x4S => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF32),
1996
V128ConvertKind::I32x4LowS => self.asm.xmm_vcvt_rr(src, dst, VcvtKind::I32ToF64),
1997
V128ConvertKind::I32x4U => {
1998
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
1999
// Split each 32-bit integer into 16-bit parts.
2000
// `scratch` will contain the low bits and `dst` will contain
2001
// the high bits.
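// Converting each half separately sidesteps the lack of an unsigned
// i32 -> f32 conversion. E.g. for the lane 0x8000_1234: the low half is
// 0x1234 and the high half is 0x8000_0000; the high half is halved
// (logical shift right by 1) to 0x4000_0000 before the signed
// conversion, doubled back afterwards, and then added to the converted
// low half.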
2002
masm.asm
2003
.xmm_vpsll_rri(src, scratch.writable(), 0x10, kind.src_lane_size());
2004
masm.asm.xmm_vpsrl_rri(
2005
scratch.inner(),
2006
scratch.writable(),
2007
0x10,
2008
kind.src_lane_size(),
2009
);
2010
masm.asm
2011
.xmm_vpsub_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2012
2013
// Convert the low bits in `scratch` to floating point numbers.
2014
masm.asm
2015
.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
2016
2017
// Prevent overflow by right shifting high bits.
2018
masm.asm
2019
.xmm_vpsrl_rri(dst.to_reg(), dst, 1, kind.src_lane_size());
2020
// Convert high bits in `dst` to floating point numbers.
2021
masm.asm.xmm_vcvt_rr(dst.to_reg(), dst, VcvtKind::I32ToF32);
2022
// Double high bits in `dst` to reverse right shift.
2023
masm.asm
2024
.xmm_vaddp_rrr(dst.to_reg(), dst.to_reg(), dst, kind.src_lane_size());
2025
// Add high bits in `dst` to low bits in `scratch`.
2026
masm.asm.xmm_vaddp_rrr(
2027
dst.to_reg(),
2028
scratch.inner(),
2029
dst,
2030
kind.src_lane_size(),
2031
);
2032
});
2033
}
2034
V128ConvertKind::I32x4LowU => {
2035
// See
2036
// https://github.com/bytecodealliance/wasmtime/blob/bb886ffc3c81a476d8ba06311ff2dede15a6f7e1/cranelift/codegen/src/isa/x64/lower.isle#L3668
2037
// for details on the Cranelift AVX implementation.
2038
// Use `vunpcklp` to create doubles from the integers.
2039
// Interleaving 0x1.0p52 (i.e., 0x43300000) with the integers
2040
// creates a byte array for a double that sets the mantissa
2041
// bits to the original integer value.
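// E.g. for the integer 7, the interleaved bytes form the double with
// bit pattern 0x4330_0000_0000_0007, whose value is 2^52 + 7; the
// subtraction below then yields exactly 7.0, since both values are
// exactly representable.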
2042
let conversion_constant = self
2043
.asm
2044
.add_constant(&[0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x30, 0x43]);
2045
self.asm
2046
.xmm_vunpcklp_rrm(src, &conversion_constant, dst, kind.src_lane_size());
2047
// Subtract the 0x1.0p52 added above.
2048
let conversion_constant = self.asm.add_constant(&[
2049
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00,
2050
0x00, 0x30, 0x43,
2051
]);
2052
self.asm.xmm_vsub_rrm(
2053
dst.to_reg(),
2054
&conversion_constant,
2055
dst,
2056
kind.dst_lane_size(),
2057
);
2058
}
2059
}
2060
Ok(())
2061
}
2062
2063
fn v128_narrow(
2064
&mut self,
2065
src1: Reg,
2066
src2: Reg,
2067
dst: WritableReg,
2068
kind: V128NarrowKind,
2069
) -> Result<()> {
2070
self.ensure_has_avx()?;
2071
match kind {
2072
V128NarrowKind::I16x8S | V128NarrowKind::I32x4S => {
2073
self.asm
2074
.xmm_vpackss_rrr(src1, src2, dst, kind.dst_lane_size())
2075
}
2076
V128NarrowKind::I16x8U | V128NarrowKind::I32x4U => {
2077
self.asm
2078
.xmm_vpackus_rrr(src1, src2, dst, kind.dst_lane_size())
2079
}
2080
}
2081
Ok(())
2082
}
2083
2084
fn v128_demote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2085
self.ensure_has_avx()?;
2086
self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F64ToF32);
2087
Ok(())
2088
}
2089
2090
fn v128_promote(&mut self, src: Reg, dst: WritableReg) -> Result<()> {
2091
self.ensure_has_avx()?;
2092
self.asm.xmm_vcvt_rr(src, dst, VcvtKind::F32ToF64);
2093
Ok(())
2094
}
2095
2096
fn v128_extend(&mut self, src: Reg, dst: WritableReg, kind: V128ExtendKind) -> Result<()> {
2097
self.ensure_has_avx()?;
2098
match kind {
2099
V128ExtendKind::LowI8x16S
2100
| V128ExtendKind::LowI8x16U
2101
| V128ExtendKind::LowI16x8S
2102
| V128ExtendKind::LowI16x8U
2103
| V128ExtendKind::LowI32x4S
2104
| V128ExtendKind::LowI32x4U => self.asm.xmm_vpmov_rr(src, dst, kind.into()),
2105
V128ExtendKind::HighI8x16S | V128ExtendKind::HighI16x8S => {
2106
self.asm.xmm_vpalignr_rrr(src, src, dst, 0x8);
2107
self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2108
}
2109
V128ExtendKind::HighI8x16U | V128ExtendKind::HighI16x8U => {
2110
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2111
masm.asm
2112
.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2113
masm.asm
2114
.xmm_vpunpckh_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2115
});
2116
}
2117
V128ExtendKind::HighI32x4S => {
2118
// Move the 3rd element (i.e., 0b10) to the 1st (rightmost)
2119
// position and the 4th element (i.e., 0b11) to the 2nd (second
2120
// from the right) position and then perform the extend.
2121
self.asm
2122
.xmm_vpshuf_rr(src, dst, 0b11_10_11_10, kind.src_lane_size());
2123
self.asm.xmm_vpmov_rr(dst.to_reg(), dst, kind.into());
2124
}
2125
V128ExtendKind::HighI32x4U => {
2126
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2127
// Set `scratch` to a vector of 0s.
2128
masm.asm.xmm_vxorp_rrr(
2129
scratch.inner(),
2130
scratch.inner(),
2131
scratch.writable(),
2132
kind.src_lane_size(),
2133
);
2134
// Interleave the 0 bits into the two 32-bit integers to zero extend them.
2135
masm.asm
2136
.xmm_vunpckhp_rrr(src, scratch.inner(), dst, kind.src_lane_size());
2137
});
2138
}
2139
}
2140
Ok(())
2141
}
2142
2143
fn v128_add(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128AddKind) -> Result<()> {
2144
self.ensure_has_avx()?;
2145
match kind {
2146
V128AddKind::F32x4 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S32),
2147
V128AddKind::F64x2 => self.asm.xmm_vaddp_rrr(lhs, rhs, dst, OperandSize::S64),
2148
V128AddKind::I8x16 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S8),
2149
V128AddKind::I8x16SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S8),
2150
V128AddKind::I8x16SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S8),
2151
V128AddKind::I16x8 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S16),
2152
V128AddKind::I16x8SatS => self.asm.xmm_vpadds_rrr(dst, lhs, rhs, OperandSize::S16),
2153
V128AddKind::I16x8SatU => self.asm.xmm_vpaddus_rrr(dst, lhs, rhs, OperandSize::S16),
2154
V128AddKind::I32x4 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S32),
2155
V128AddKind::I64x2 => self.asm.xmm_vpadd_rrr(lhs, rhs, dst, OperandSize::S64),
2156
};
2157
Ok(())
2158
}
2159
2160
fn v128_sub(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, kind: V128SubKind) -> Result<()> {
2161
self.ensure_has_avx()?;
2162
match kind {
2163
V128SubKind::F32x4 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S32),
2164
V128SubKind::F64x2 => self.asm.xmm_vsubp_rrr(lhs, rhs, dst, OperandSize::S64),
2165
V128SubKind::I8x16 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S8),
2166
V128SubKind::I8x16SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S8),
2167
V128SubKind::I8x16SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S8),
2168
V128SubKind::I16x8 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S16),
2169
V128SubKind::I16x8SatS => self.asm.xmm_vpsubs_rrr(dst, lhs, rhs, OperandSize::S16),
2170
V128SubKind::I16x8SatU => self.asm.xmm_vpsubus_rrr(dst, lhs, rhs, OperandSize::S16),
2171
V128SubKind::I32x4 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S32),
2172
V128SubKind::I64x2 => self.asm.xmm_vpsub_rrr(lhs, rhs, dst, OperandSize::S64),
2173
};
2174
Ok(())
2175
}
2176
2177
fn v128_mul(
2178
&mut self,
2179
context: &mut CodeGenContext<Emission>,
2180
kind: V128MulKind,
2181
) -> Result<()> {
2182
self.ensure_has_avx()?;
2183
2184
let rhs = context.pop_to_reg(self, None)?;
2185
let lhs = context.pop_to_reg(self, None)?;
2186
2187
let mul_i64x2_avx512 = |this: &mut Self| {
2188
this.asm.vpmullq(lhs.reg, rhs.reg, writable!(lhs.reg));
2189
};
2190
2191
let mul_i64x2_fallback = |this: &mut Self,
2192
context: &mut CodeGenContext<Emission>|
2193
-> Result<()> {
2194
// Standard AVX doesn't have an instruction for i64x2 multiplication; instead, we have to
// fall back to an instruction sequence using 32-bit multiplication (taken from the
// Cranelift implementation, in `isa/x64/lower.isle`):
2197
//
2198
// > Otherwise, for i64x2 multiplication we describe a lane A as being composed of
2199
// > a 32-bit upper half "Ah" and a 32-bit lower half "Al". The 32-bit long hand
2200
// > multiplication can then be written as:
2201
//
2202
// > Ah Al
2203
// > * Bh Bl
2204
// > -----
2205
// > Al * Bl
2206
// > + (Ah * Bl) << 32
2207
// > + (Al * Bh) << 32
2208
//
2209
// > So for each lane we will compute:
2210
//
2211
// > A * B = (Al * Bl) + ((Ah * Bl) + (Al * Bh)) << 32
2212
//
2213
// > Note, the algorithm will use `pmuludq` which operates directly on the lower
2214
// > 32-bit (`Al` or `Bl`) of a lane and writes the result to the full 64-bits of
2215
// > the lane of the destination. For this reason we don't need shifts to isolate
2216
// > the lower 32-bits, however, we will need to use shifts to isolate the high
2217
// > 32-bits when doing calculations, i.e., `Ah == A >> 32`.
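// As a small sanity check, take A = 2^32 + 3 (Ah = 1, Al = 3) and
// B = 2 * 2^32 + 5 (Bh = 2, Bl = 5). Then Al * Bl = 15 and
// (Ah * Bl + Al * Bh) << 32 = (5 + 6) << 32, so the wrapped 64-bit
// product is 11 * 2^32 + 15, which matches (A * B) mod 2^64 (the
// Ah * Bh term only affects bits above 64 and is dropped).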
2218
2219
let tmp2 = context.any_fpr(this)?;
2220
this.with_scratch::<FloatScratch, _>(|this, tmp1| {
2221
// tmp1 = lhs_hi = (lhs >> 32)
2222
this.asm
2223
.xmm_vpsrl_rri(lhs.reg, tmp1.writable(), 32, OperandSize::S64);
2224
2225
// tmp2 = lhs_hi * rhs_low = tmp1 * rhs
2226
this.asm
2227
.xmm_vpmuldq_rrr(tmp1.inner(), rhs.reg, writable!(tmp2));
2228
2229
// tmp1 = rhs_hi = rhs >> 32
2230
this.asm
2231
.xmm_vpsrl_rri(rhs.reg, tmp1.writable(), 32, OperandSize::S64);
2232
2233
// tmp1 = lhs_low * rhs_high = tmp1 * lhs
2234
this.asm
2235
.xmm_vpmuludq_rrr(tmp1.inner(), lhs.reg, tmp1.writable());
2236
2237
// tmp1 = ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2238
this.asm
2239
.xmm_vpadd_rrr(tmp1.inner(), tmp2, tmp1.writable(), OperandSize::S64);
2240
2241
// tmp1 = tmp1 << 32
2242
this.asm
2243
.xmm_vpsll_rri(tmp1.inner(), tmp1.writable(), 32, OperandSize::S64);
2244
2245
// tmp2 = lhs_lo + rhs_lo
2246
this.asm.xmm_vpmuludq_rrr(lhs.reg, rhs.reg, writable!(tmp2));
2247
2248
// finally, with `lhs` as destination:
2249
// lhs = (lhs_low * rhs_low) + ((lhs_hi * rhs_low) + (lhs_lo * rhs_hi)) = tmp1 + tmp2
2250
this.asm
2251
.xmm_vpadd_rrr(tmp1.inner(), tmp2, writable!(lhs.reg), OperandSize::S64);
2252
});
2253
2254
context.free_reg(tmp2);
2255
2256
Ok(())
2257
};
2258
2259
match kind {
2260
V128MulKind::F32x4 => {
2261
self.asm
2262
.xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2263
}
2264
V128MulKind::F64x2 => {
2265
self.asm
2266
.xmm_vmulp_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S64)
2267
}
2268
V128MulKind::I16x8 => {
2269
self.asm
2270
.xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S16)
2271
}
2272
V128MulKind::I32x4 => {
2273
self.asm
2274
.xmm_vpmull_rrr(lhs.reg, rhs.reg, writable!(lhs.reg), OperandSize::S32)
2275
}
2276
// This is the fast path when AVX512 is available.
2277
V128MulKind::I64x2
2278
if self.ensure_has_avx512vl().is_ok() && self.ensure_has_avx512dq().is_ok() =>
2279
{
2280
mul_i64x2_avx512(self)
2281
}
2282
// Otherwise, we emit the AVX fallback sequence.
2283
V128MulKind::I64x2 => mul_i64x2_fallback(self, context)?,
2284
}
2285
2286
context.stack.push(lhs.into());
2287
context.free_reg(rhs);
2288
2289
Ok(())
2290
}
2291
2292
fn v128_abs(&mut self, src: Reg, dst: WritableReg, kind: V128AbsKind) -> Result<()> {
2293
self.ensure_has_avx()?;
2294
2295
match kind {
2296
V128AbsKind::I8x16 | V128AbsKind::I16x8 | V128AbsKind::I32x4 => {
2297
self.asm.xmm_vpabs_rr(src, dst, kind.lane_size())
2298
}
2299
V128AbsKind::I64x2 => {
2300
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2301
// Perform an arithmetic right shift of 31 bits. If the number
2302
// is positive, this will result in all zeroes in the upper
2303
// 32-bits. If the number is negative, this will result in all
2304
// ones in the upper 32-bits.
2305
masm.asm
2306
.xmm_vpsra_rri(src, scratch.writable(), 0x1f, OperandSize::S32);
2307
// Copy the ones and zeroes in the high bits of each 64-bit
2308
// lane to the low bits of each 64-bit lane.
2309
masm.asm.xmm_vpshuf_rr(
2310
scratch.inner(),
2311
scratch.writable(),
2312
0b11_11_01_01,
2313
OperandSize::S32,
2314
);
2315
// Flip the bits in lanes that were negative in `src` and leave
2316
// the positive lanes as they are. Positive lanes will have a
2317
// zero mask in `scratch` so xor doesn't affect them.
2318
masm.asm.xmm_vpxor_rrr(src, scratch.inner(), dst);
2319
// Subtract the mask from the results of xor which will
2320
// complete the two's complement for lanes which were negative.
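// E.g. for a lane holding -5: the mask is all 1s (i.e. -1), the XOR
// gives !(-5) = 4, and 4 - (-1) = 5, which is the usual two's
// complement negation (!x + 1). Positive lanes get a zero mask and are
// left untouched by both the XOR and the subtraction.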
2321
masm.asm
2322
.xmm_vpsub_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2323
});
2324
}
2325
V128AbsKind::F32x4 | V128AbsKind::F64x2 => {
2326
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2327
// Create a mask of all ones.
2328
masm.asm.xmm_vpcmpeq_rrr(
2329
scratch.writable(),
2330
scratch.inner(),
2331
scratch.inner(),
2332
kind.lane_size(),
2333
);
2334
// Right shift the mask so each lane is a single zero followed
2335
// by all ones.
2336
masm.asm.xmm_vpsrl_rri(
2337
scratch.inner(),
2338
scratch.writable(),
2339
0x1,
2340
kind.lane_size(),
2341
);
2342
// Use the mask to zero the sign bit in each lane which will
2343
// make the float value positive.
2344
masm.asm
2345
.xmm_vandp_rrr(src, scratch.inner(), dst, kind.lane_size());
2346
});
2347
}
2348
}
2349
Ok(())
2350
}
2351
2352
fn v128_neg(&mut self, op: WritableReg, kind: V128NegKind) -> Result<()> {
2353
self.ensure_has_avx()?;
2354
2355
match kind {
2356
V128NegKind::I8x16 | V128NegKind::I16x8 | V128NegKind::I32x4 | V128NegKind::I64x2 => {
2357
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2358
masm.v128_xor(tmp.inner(), tmp.inner(), tmp.writable())?;
2359
masm.v128_sub(tmp.inner(), op.to_reg(), op, kind.into())?;
2360
wasmtime_environ::error::Ok(())
2361
})?;
2362
}
2363
V128NegKind::F32x4 | V128NegKind::F64x2 => {
2364
self.with_scratch::<FloatScratch, _>(|masm, tmp| {
2365
// Create a mask of all 1s.
2366
masm.asm.xmm_vpcmpeq_rrr(
2367
tmp.writable(),
2368
tmp.inner(),
2369
tmp.inner(),
2370
kind.lane_size(),
2371
);
2372
// Left shift the lanes in the mask so only the sign bit in the
2373
// mask is set to 1.
2374
masm.asm.xmm_vpsll_rri(
2375
tmp.inner(),
2376
tmp.writable(),
2377
(kind.lane_size().num_bits() - 1) as u32,
2378
kind.lane_size(),
2379
);
2380
// Use the mask to flip the sign bit.
2381
masm.asm
2382
.xmm_vxorp_rrr(op.to_reg(), tmp.inner(), op, kind.lane_size());
2383
});
2384
}
2385
}
2386
Ok(())
2387
}
2388
2389
fn v128_shift(
2390
&mut self,
2391
context: &mut CodeGenContext<Emission>,
2392
lane_width: OperandSize,
2393
kind: ShiftKind,
2394
) -> Result<()> {
2395
self.ensure_has_avx()?;
2396
let shift_amount = context.pop_to_reg(self, None)?.reg;
2397
let operand = context.pop_to_reg(self, None)?.reg;
2398
let amount_mask = lane_width.num_bits() - 1;
2399
2400
self.and(
2401
writable!(shift_amount),
2402
shift_amount,
2403
RegImm::i32(amount_mask as i32),
2404
OperandSize::S32,
2405
)?;
2406
2407
let move_to_tmp_xmm = |this: &mut Self, tmp_xmm: Scratch| {
2408
this.asm
2409
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2410
};
2411
2412
// A helper for deciding between `vpsllw` and `vpsrlw` in
2413
// `shift_i8x16`.
2414
enum Direction {
2415
Left,
2416
Right,
2417
}
2418
2419
let shift_i8x16 = |this: &mut Self,
2420
masks: &'static [u8],
2421
direction: Direction|
2422
-> Result<()> {
2423
// The case for i8x16 is a little bit trickier because x64 doesn't provide an
// 8-bit shift instruction. Instead, we shift as 16 bits, and then mask the bits
// in the 8-bit lanes, for example (with 2 8-bit lanes):
2426
// - Before shifting:
2427
// 01001101 11101110
2428
// - shifting by 2 left:
2429
// 00110111 10111000
2430
// ^^_ these bits come from the previous byte, and need to be masked.
2431
// - The mask:
2432
// 11111100 11111111
2433
// - After masking:
2434
// 00110100 10111000
2435
//
2436
// The mask is loaded from the well-known constant table above, indexed by the shift amount.
2437
2438
this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2439
this.asm
2440
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2441
2442
// Perform the 16-bit shift.
2443
match direction {
2444
Direction::Left => this.asm.xmm_vpsll_rrr(
2445
operand,
2446
tmp_xmm.inner(),
2447
writable!(operand),
2448
OperandSize::S16,
2449
),
2450
Direction::Right => this.asm.xmm_vpsrl_rrr(
2451
operand,
2452
tmp_xmm.inner(),
2453
writable!(operand),
2454
OperandSize::S16,
2455
),
2456
}
2457
2458
// Get a handle to the masks array constant.
2459
let masks_addr = this.asm.add_constant(masks);
2460
2461
this.with_scratch::<IntScratch, _>(|this, tmp| {
2462
// Load the masks array effective address into the tmp register.
2463
this.asm.lea(&masks_addr, tmp.writable(), OperandSize::S64);
2464
2465
// Compute the offset of the mask that we need to use. This is shift_amount * 16 ==
2466
// shift_amount << 4.
2467
this.asm
2468
.shift_ir(4, writable!(shift_amount), ShiftKind::Shl, OperandSize::S32);
2469
2470
// Load the mask to tmp_xmm.
2471
this.asm.xmm_vmovdqu_mr(
2472
&Address::ImmRegRegShift {
2473
simm32: 0,
2474
base: tmp.inner(),
2475
index: shift_amount,
2476
shift: 0,
2477
},
2478
tmp_xmm.writable(),
2479
MemFlags::trusted(),
2480
);
2481
});
2482
2483
// Mask unwanted bits from operand.
2484
this.asm
2485
.xmm_vpand_rrr(tmp_xmm.inner(), operand, writable!(operand));
2486
Ok(())
2487
})
2488
};
2489
2490
let i64x2_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2491
const SIGN_MASK: u128 = 0x8000000000000000_8000000000000000;
2492
2493
// AVX doesn't have an instruction for i64x2 signed right shift. Instead we use the
2494
// following formula (from Hacker's Delight 2-7), where x is the value and n the shift
2495
// amount, for each lane:
2496
// t = (1 << 63) >> n; ((x >> n) ^ t) - t
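// E.g. for x = -8 and n = 1: the logical shift gives
// 0x7FFF_FFFF_FFFF_FFFC and t = 0x4000_0000_0000_0000; the XOR clears
// the bit that t set, and subtracting t borrows through the upper
// bits, yielding 0xFFFF_FFFF_FFFF_FFFC = -4, the arithmetic shift
// result.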
2497
2498
// We need an extra scratch register:
2499
let tmp_xmm2 = context.any_fpr(this)?;
2500
2501
this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2502
this.asm
2503
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2504
2505
let cst = this.asm.add_constant(&SIGN_MASK.to_le_bytes());
2506
2507
this.asm
2508
.xmm_vmovdqu_mr(&cst, writable!(tmp_xmm2), MemFlags::trusted());
2509
this.asm.xmm_vpsrl_rrr(
2510
tmp_xmm2,
2511
tmp_xmm.inner(),
2512
writable!(tmp_xmm2),
2513
OperandSize::S64,
2514
);
2515
this.asm.xmm_vpsrl_rrr(
2516
operand,
2517
tmp_xmm.inner(),
2518
writable!(operand),
2519
OperandSize::S64,
2520
);
2521
});
2522
this.asm
2523
.xmm_vpxor_rrr(operand, tmp_xmm2, writable!(operand));
2524
this.asm
2525
.xmm_vpsub_rrr(operand, tmp_xmm2, writable!(operand), OperandSize::S64);
2526
2527
context.free_reg(tmp_xmm2);
2528
2529
Ok(())
2530
};
2531
2532
let i8x16_shr_s = |this: &mut Self, context: &mut CodeGenContext<Emission>| -> Result<()> {
2533
// Since the x86 instruction set does not have an 8x16 shift instruction and the
2534
// approach used for `ishl` and `ushr` cannot be easily used (the masks do not
2535
// preserve the sign), we use a different approach here: separate the low and
2536
// high lanes, shift them separately, and merge them into the final result.
2537
//
2538
// Visually, this looks like the following, where `src.i8x16 = [s0, s1, ...,
2539
// s15]:
2540
//
2541
// lo.i16x8 = [(s0, s0), (s1, s1), ..., (s7, s7)]
2542
// shifted_lo.i16x8 = shift each lane of `low`
2543
// hi.i16x8 = [(s8, s8), (s9, s9), ..., (s15, s15)]
2544
// shifted_hi.i16x8 = shift each lane of `high`
2545
// result = [s0'', s1'', ..., s15'']
2546
2547
// In order for `packsswb` later to only use the high byte of each
2548
// 16x8 lane, we shift right an extra 8 bits, relying on `psraw` to
2549
// fill in the upper bits appropriately.
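// E.g. shifting the byte 0xFE (-2) right by n = 1: interleaving the
// byte with itself gives the 16-bit lane 0xFEFE, `psraw` by n + 8 = 9
// yields 0xFFFF (-1), and `packsswb` saturates that back to the byte
// 0xFF, i.e. -2 >> 1 = -1.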
2550
let tmp_lo = context.any_fpr(this)?;
2551
let tmp_hi = context.any_fpr(this)?;
2552
2553
this.with_scratch::<FloatScratch, _>(|this, tmp_xmm| {
2554
this.asm
2555
.add_ir(8, writable!(shift_amount), OperandSize::S32);
2556
this.asm
2557
.avx_gpr_to_xmm(shift_amount, tmp_xmm.writable(), OperandSize::S32);
2558
2559
// Extract lower and upper bytes.
2560
this.asm
2561
.xmm_vpunpckl_rrr(operand, operand, writable!(tmp_lo), OperandSize::S8);
2562
this.asm
2563
.xmm_vpunpckh_rrr(operand, operand, writable!(tmp_hi), OperandSize::S8);
2564
2565
// Perform a 16-bit right shift of the upper and lower bytes.
2566
this.asm.xmm_vpsra_rrr(
2567
tmp_lo,
2568
tmp_xmm.inner(),
2569
writable!(tmp_lo),
2570
OperandSize::S16,
2571
);
2572
this.asm.xmm_vpsra_rrr(
2573
tmp_hi,
2574
tmp_xmm.inner(),
2575
writable!(tmp_hi),
2576
OperandSize::S16,
2577
);
2578
});
2579
2580
// Merge lower and upper bytes back.
2581
this.asm
2582
.xmm_vpackss_rrr(tmp_lo, tmp_hi, writable!(operand), OperandSize::S8);
2583
2584
context.free_reg(tmp_lo);
2585
context.free_reg(tmp_hi);
2586
2587
Ok(())
2588
};
2589
2590
match (lane_width, kind) {
2591
// shl
2592
(OperandSize::S8, ShiftKind::Shl) => {
2593
shift_i8x16(self, &I8X16_ISHL_MASKS, Direction::Left)?
2594
}
2595
(OperandSize::S16, ShiftKind::Shl) => {
2596
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2597
move_to_tmp_xmm(masm, tmp_xmm);
2598
masm.asm.xmm_vpsll_rrr(
2599
operand,
2600
tmp_xmm.inner(),
2601
writable!(operand),
2602
OperandSize::S16,
2603
);
2604
})
2605
}
2606
(OperandSize::S32, ShiftKind::Shl) => {
2607
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2608
move_to_tmp_xmm(masm, tmp_xmm);
2609
masm.asm.xmm_vpsll_rrr(
2610
operand,
2611
tmp_xmm.inner(),
2612
writable!(operand),
2613
OperandSize::S32,
2614
);
2615
})
2616
}
2617
(OperandSize::S64, ShiftKind::Shl) => {
2618
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2619
move_to_tmp_xmm(masm, tmp_xmm);
2620
masm.asm.xmm_vpsll_rrr(
2621
operand,
2622
tmp_xmm.inner(),
2623
writable!(operand),
2624
OperandSize::S64,
2625
);
2626
})
2627
}
2628
// shr_u
2629
(OperandSize::S8, ShiftKind::ShrU) => {
2630
shift_i8x16(self, &I8X16_USHR_MASKS, Direction::Right)?
2631
}
2632
(OperandSize::S16, ShiftKind::ShrU) => {
2633
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2634
move_to_tmp_xmm(masm, tmp_xmm);
2635
masm.asm.xmm_vpsrl_rrr(
2636
operand,
2637
tmp_xmm.inner(),
2638
writable!(operand),
2639
OperandSize::S16,
2640
);
2641
})
2642
}
2643
(OperandSize::S32, ShiftKind::ShrU) => {
2644
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2645
move_to_tmp_xmm(masm, tmp_xmm);
2646
masm.asm.xmm_vpsrl_rrr(
2647
operand,
2648
tmp_xmm.inner(),
2649
writable!(operand),
2650
OperandSize::S32,
2651
);
2652
})
2653
}
2654
(OperandSize::S64, ShiftKind::ShrU) => {
2655
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2656
move_to_tmp_xmm(masm, tmp_xmm);
2657
masm.asm.xmm_vpsrl_rrr(
2658
operand,
2659
tmp_xmm.inner(),
2660
writable!(operand),
2661
OperandSize::S64,
2662
);
2663
})
2664
}
2665
// shr_s
2666
(OperandSize::S8, ShiftKind::ShrS) => i8x16_shr_s(self, context)?,
2667
(OperandSize::S16, ShiftKind::ShrS) => {
2668
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2669
move_to_tmp_xmm(masm, tmp_xmm);
2670
masm.asm.xmm_vpsra_rrr(
2671
operand,
2672
tmp_xmm.inner(),
2673
writable!(operand),
2674
OperandSize::S16,
2675
);
2676
})
2677
}
2678
(OperandSize::S32, ShiftKind::ShrS) => {
2679
self.with_scratch::<FloatScratch, _>(|masm, tmp_xmm| {
2680
move_to_tmp_xmm(masm, tmp_xmm);
2681
masm.asm.xmm_vpsra_rrr(
2682
operand,
2683
tmp_xmm.inner(),
2684
writable!(operand),
2685
OperandSize::S32,
2686
);
2687
})
2688
}
2689
(OperandSize::S64, ShiftKind::ShrS) => i64x2_shr_s(self, context)?,
2690
2691
_ => bail!(CodeGenError::invalid_operand_combination()),
2692
}
2693
2694
context.free_reg(shift_amount);
2695
context
2696
.stack
2697
.push(TypedReg::new(WasmValType::V128, operand).into());
2698
Ok(())
2699
}
2700
2701
fn v128_q15mulr_sat_s(
2702
&mut self,
2703
lhs: Reg,
2704
rhs: Reg,
2705
dst: WritableReg,
2706
size: OperandSize,
2707
) -> Result<()> {
2708
self.ensure_has_avx()?;
2709
2710
self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);
2711
2712
// Need to handle edge case of multiplying -1 by -1 (0x8000 in Q15
2713
// format) because of how `vpmulhrs` handles rounding. `vpmulhrs`
2714
// produces 0x8000 in that case when the correct result is 0x7FFF (that
2715
// is, +1) so need to check if the result is 0x8000 and flip the bits
2716
// of the result if it is.
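// E.g. for 0x8000 * 0x8000 (-1.0 * -1.0 in Q15): `vpmulhrs` computes
// ((0x4000_0000 >> 14) + 1) >> 1 = 0x8000, which wraps to -1.0 instead
// of saturating to 0x7FFF; the compare below produces an all-1s mask
// only for such lanes, and XORing 0x8000 with 0xFFFF gives 0x7FFF.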
2717
let address = self.asm.add_constant(&[
2718
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
2719
0x00, 0x80,
2720
]);
2721
self.asm
2722
.xmm_vpcmpeq_rrm(writable!(rhs), dst.to_reg(), &address, size);
2723
self.asm.xmm_vpxor_rrr(dst.to_reg(), rhs, dst);
2724
Ok(())
2725
}
2726
2727
fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2728
self.ensure_has_avx()?;
2729
2730
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2731
// Create a mask of all 0s.
2732
masm.asm
2733
.xmm_vpxor_rrr(scratch.inner(), scratch.inner(), scratch.writable());
2734
// Sets each lane in `src` to all 1s if the original lane was zero, and
// to zero if the original lane was not zero.
2736
masm.asm
2737
.xmm_vpcmpeq_rrr(writable!(src), src, scratch.inner(), size);
2738
// Sets ZF if all values are zero (i.e., if all original values were not zero).
2739
masm.asm.xmm_vptest(src, src);
2740
// Set byte if ZF=1.
2741
});
2742
self.asm.setcc(IntCmpKind::Eq, dst);
2743
Ok(())
2744
}
2745
2746
fn v128_bitmask(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
2747
self.ensure_has_avx()?;
2748
2749
match size {
2750
OperandSize::S8 => self.asm.xmm_vpmovmsk_rr(src, dst, size, OperandSize::S32),
2751
OperandSize::S16 => {
2752
// Signed conversion of 16-bit integers to 8-bit integers.
2753
self.asm
2754
.xmm_vpackss_rrr(src, src, writable!(src), OperandSize::S8);
2755
// Creates a mask from each byte in `src`.
2756
self.asm
2757
.xmm_vpmovmsk_rr(src, dst, OperandSize::S8, OperandSize::S32);
2758
// Removes 8 bits added as a result of the `vpackss` step.
2759
self.asm
2760
.shift_ir(0x8, dst, ShiftKind::ShrU, OperandSize::S32);
2761
}
2762
OperandSize::S32 | OperandSize::S64 => {
2763
self.asm.xmm_vmovskp_rr(src, dst, size, OperandSize::S32)
2764
}
2765
_ => unimplemented!(),
2766
}
2767
2768
Ok(())
2769
}
2770
2771
fn v128_trunc(
2772
&mut self,
2773
context: &mut CodeGenContext<Emission>,
2774
kind: V128TruncKind,
2775
) -> Result<()> {
2776
self.ensure_has_avx()?;
2777
2778
let reg = writable!(context.pop_to_reg(self, None)?.reg);
2779
match kind {
2780
V128TruncKind::F32x4 | V128TruncKind::F64x2 => self.asm.xmm_vroundp_rri(
2781
reg.to_reg(),
2782
reg,
2783
VroundMode::TowardZero,
2784
kind.dst_lane_size(),
2785
),
2786
V128TruncKind::I32x4FromF32x4S => {
2787
self.v128_trunc_sat_f32x4_s(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2788
}
2789
V128TruncKind::I32x4FromF32x4U => {
2790
let temp_reg = writable!(context.any_fpr(self)?);
2791
self.v128_trunc_sat_f32x4_u(
2792
reg,
2793
temp_reg,
2794
kind.src_lane_size(),
2795
kind.dst_lane_size(),
2796
)?;
2797
context.free_reg(temp_reg.to_reg());
2798
}
2799
V128TruncKind::I32x4FromF64x2SZero => {
2800
self.v128_trunc_sat_f64x2_s_zero(reg, kind.src_lane_size())?;
2801
}
2802
V128TruncKind::I32x4FromF64x2UZero => {
2803
self.v128_trunc_sat_f64x2_u_zero(reg, kind.src_lane_size(), kind.dst_lane_size())?;
2804
}
2805
}
2806
2807
context.stack.push(TypedReg::v128(reg.to_reg()).into());
2808
Ok(())
2809
}
2810
2811
fn v128_min(
2812
&mut self,
2813
src1: Reg,
2814
src2: Reg,
2815
dst: WritableReg,
2816
kind: V128MinKind,
2817
) -> Result<()> {
2818
self.ensure_has_avx()?;
2819
2820
match kind {
2821
V128MinKind::I8x16S
2822
| V128MinKind::I8x16U
2823
| V128MinKind::I16x8S
2824
| V128MinKind::I16x8U
2825
| V128MinKind::I32x4S
2826
| V128MinKind::I32x4U => {
2827
match kind {
2828
V128MinKind::I8x16S => {
2829
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S8)
2830
}
2831
V128MinKind::I8x16U => {
2832
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S8)
2833
}
2834
V128MinKind::I16x8S => {
2835
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S16)
2836
}
2837
V128MinKind::I16x8U => {
2838
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S16)
2839
}
2840
V128MinKind::I32x4S => {
2841
self.asm.xmm_vpmins_rrr(dst, src1, src2, OperandSize::S32)
2842
}
2843
V128MinKind::I32x4U => {
2844
self.asm.xmm_vpminu_rrr(dst, src1, src2, OperandSize::S32)
2845
}
2846
_ => unreachable!(),
2847
};
2848
}
2849
V128MinKind::F32x4 | V128MinKind::F64x2 => {
2850
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2851
// `vminp` is not commutative with respect to +0/-0 and NaN values, so
// we have to compensate.
2853
// Perform two comparison operations with the operands swapped
2854
// and OR the result to propagate 0 (positive and negative) and
2855
// NaN.
2856
masm.asm
2857
.xmm_vminp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2858
masm.asm.xmm_vminp_rrr(src2, src1, dst, kind.lane_size());
2859
// Use a single OR instruction to set the sign bit if either
2860
// result has the sign bit set to correctly propagate -0.
2861
masm.asm
2862
.xmm_vorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2863
});
2864
// Set lanes with NaN to all 1s.
2865
self.asm.xmm_vcmpp_rrr(
2866
writable!(src2),
2867
src2,
2868
dst.to_reg(),
2869
kind.lane_size(),
2870
VcmpKind::Unord,
2871
);
2872
// Doesn't change non-NaN values. For NaN values, sets all bits.
2873
self.asm
2874
.xmm_vorp_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2875
self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2876
}
2877
}
2878
2879
Ok(())
2880
}
2881
2882
fn v128_max(
2883
&mut self,
2884
src1: Reg,
2885
src2: Reg,
2886
dst: WritableReg,
2887
kind: V128MaxKind,
2888
) -> Result<()> {
2889
self.ensure_has_avx()?;
2890
2891
match kind {
2892
V128MaxKind::I8x16S
2893
| V128MaxKind::I8x16U
2894
| V128MaxKind::I16x8S
2895
| V128MaxKind::I16x8U
2896
| V128MaxKind::I32x4S
2897
| V128MaxKind::I32x4U => {
2898
match kind {
2899
V128MaxKind::I8x16S => {
2900
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S8)
2901
}
2902
V128MaxKind::I8x16U => {
2903
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S8)
2904
}
2905
V128MaxKind::I16x8S => {
2906
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S16)
2907
}
2908
V128MaxKind::I16x8U => {
2909
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S16)
2910
}
2911
V128MaxKind::I32x4S => {
2912
self.asm.xmm_vpmaxs_rrr(dst, src1, src2, OperandSize::S32)
2913
}
2914
V128MaxKind::I32x4U => {
2915
self.asm.xmm_vpmaxu_rrr(dst, src1, src2, OperandSize::S32)
2916
}
2917
_ => unreachable!(),
2918
};
2919
}
2920
V128MaxKind::F32x4 | V128MaxKind::F64x2 => {
2921
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2922
// `vmaxp` is not commutative with respect to +0/-0 and NaN values, so
// we have to compensate.
2924
// Perform two comparison operations with the operands swapped
2925
// so we can propagate 0 (positive and negative) and NaNs
2926
// correctly.
2927
2928
masm.asm
2929
.xmm_vmaxp_rrr(src1, src2, scratch.writable(), kind.lane_size());
2930
masm.asm.xmm_vmaxp_rrr(src2, src1, dst, kind.lane_size());
2931
// This combination of XOR, OR, and SUB will set the sign bit
2932
// on a 0 result to the correct value for a max operation.
2933
masm.asm
2934
.xmm_vxorp_rrr(dst.to_reg(), scratch.inner(), dst, kind.lane_size());
2935
masm.asm.xmm_vorp_rrr(
2936
dst.to_reg(),
2937
scratch.inner(),
2938
writable!(src2),
2939
kind.lane_size(),
2940
);
2941
});
2942
self.asm
2943
.xmm_vsub_rrr(src2, dst.to_reg(), dst, kind.lane_size());
2944
// Set lanes with NaN values to all 1s.
2945
self.asm.xmm_vcmpp_rrr(
2946
writable!(src2),
2947
src2,
2948
src2,
2949
kind.lane_size(),
2950
VcmpKind::Unord,
2951
);
2952
self.canonicalize_nans(writable!(src2), dst, kind.lane_size());
2953
}
2954
}
2955
Ok(())
2956
}
2957
2958
fn v128_extmul(
2959
&mut self,
2960
context: &mut CodeGenContext<Emission>,
2961
kind: V128ExtMulKind,
2962
) -> Result<()> {
2963
self.ensure_has_avx()?;
2964
2965
// The implementation for extmul is not optimized; for simplicity's sake, we simply perform
2966
// an extension followed by a multiplication using already implemented primitives.
2967
2968
let src1 = context.pop_to_reg(self, None)?;
2969
let src2 = context.pop_to_reg(self, None)?;
2970
2971
let ext_kind = kind.into();
2972
self.v128_extend(src1.reg, writable!(src1.reg), ext_kind)?;
2973
self.v128_extend(src2.reg, writable!(src2.reg), ext_kind)?;
2974
2975
context.stack.push(src2.into());
2976
context.stack.push(src1.into());
2977
2978
self.v128_mul(context, kind.into())
2979
}
2980
2981
fn v128_extadd_pairwise(
2982
&mut self,
2983
src: Reg,
2984
dst: WritableReg,
2985
kind: V128ExtAddKind,
2986
) -> Result<()> {
2987
self.ensure_has_avx()?;
2988
2989
match kind {
2990
V128ExtAddKind::I8x16S => {
2991
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
2992
// Use `vpmaddubsw` with a vector of 16 8-bit 1's which will
2993
// sign extend `src` to 16 bits and add adjacent words.
2994
// Need to supply constant as first operand since first operand
2995
// is treated as unsigned and the second operand is signed.
2996
let mask = masm.asm.add_constant(&[1; 16]);
2997
masm.asm.xmm_mov_mr(
2998
&mask,
2999
scratch.writable(),
3000
OperandSize::S128,
3001
MemFlags::trusted(),
3002
);
3003
masm.asm.xmm_vpmaddubsw_rrr(scratch.inner(), src, dst);
3004
});
3005
}
3006
V128ExtAddKind::I8x16U => {
3007
// Same approach as the signed variant but treat `src` as
3008
// unsigned instead of signed by passing it as the first
3009
// operand.
3010
let mask = self.asm.add_constant(&[1; 16]);
3011
self.asm.xmm_vpmaddubsw_rmr(src, &mask, dst);
3012
}
3013
V128ExtAddKind::I16x8S => {
3014
// Similar approach to the two variants above. The vector is 8
3015
// lanes of 16-bit 1's and `vpmaddwd` treats both operands as
3016
// signed.
3017
let mask = self
3018
.asm
3019
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3020
self.asm.xmm_vpmaddwd_rmr(src, &mask, dst);
3021
}
3022
V128ExtAddKind::I16x8U => {
3023
// Similar approach as the signed variant.
3024
// `vpmaddwd` operates on signed integers and the operand is
3025
// unsigned so the operand needs to be converted to a signed
3026
// format and then that process needs to be reversed after
3027
// `vpmaddwd`.
3028
// Flip the sign bit for 8 16-bit lanes.
3029
let xor_mask = self.asm.add_constant(&[
3030
0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00,
3031
0x80, 0x00, 0x80,
3032
]);
3033
self.asm.xmm_vpxor_rmr(src, &xor_mask, dst);
3034
3035
let madd_mask = self
3036
.asm
3037
.add_constant(&[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0]);
3038
self.asm.xmm_vpmaddwd_rmr(dst.to_reg(), &madd_mask, dst);
3039
3040
// Reverse the XOR. The XOR effectively subtracts 32,768 from
3041
// both pairs that are added together so 65,536 (0x10000)
3042
// needs to be added to 4 lanes of 32-bit values.
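// E.g. for the pair (40000, 1): after the XOR the signed lanes hold
// 40000 - 32768 = 7232 and 1 - 32768 = -32767, `vpmaddwd` sums them to
// -25535, and adding 0x10000 restores the true sum 40001.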
3043
let add_mask = self
3044
.asm
3045
.add_constant(&[0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]);
3046
self.asm
3047
.xmm_vpadd_rmr(dst.to_reg(), &add_mask, dst, OperandSize::S32);
3048
}
3049
}
3050
Ok(())
3051
}
3052
3053
fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()> {
3054
self.ensure_has_avx()?;
3055
self.asm.xmm_vpmaddwd_rrr(lhs, rhs, dst);
3056
Ok(())
3057
}
3058
3059
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
3060
self.ensure_has_avx()?;
3061
3062
let reg = writable!(context.pop_to_reg(self, None)?.reg);
3063
let reg2 = writable!(context.any_fpr(self)?);
3064
3065
// This works by using a lookup table to determine the count of bits
3066
// set in the upper 4 bits and lower 4 bits separately and then adding
3067
// the counts.
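// E.g. for the byte 0xB5 (0b1011_0101): the low nibble 0x5 looks up 2
// and the high nibble 0xB looks up 3, and 2 + 3 = 5 matches its
// population count.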
3068
3069
// A mask to zero out the upper 4 bits in each lane.
3070
let address = self.asm.add_constant(&[
3071
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
3072
0x0F, 0x0F,
3073
]);
3074
3075
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3076
// Zero out the upper 4 bits of each lane.
3077
masm.asm
3078
.xmm_vpand_rrm(reg.to_reg(), &address, scratch.writable());
3079
// Right shift bytes in input by 4 bits to put the upper 4 bits in the
3080
// lower 4 bits.
3081
masm.asm
3082
.xmm_vpsrl_rri(reg.to_reg(), reg, 0x4, OperandSize::S16);
3083
// Zero out the upper 4 bits of each shifted lane.
3084
masm.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
3085
3086
// Write a lookup table mapping each 4-bit value to its population
// count into a register so we only perform the memory read once.
3088
// Index (hex) | Value (binary) | Population Count
3089
// 0x0 | 0000 | 0
3090
// 0x1 | 0001 | 1
3091
// 0x2 | 0010 | 1
3092
// 0x3 | 0011 | 2
3093
// 0x4 | 0100 | 1
3094
// 0x5 | 0101 | 2
3095
// 0x6 | 0110 | 2
3096
// 0x7 | 0111 | 3
3097
// 0x8 | 1000 | 1
3098
// 0x9 | 1001 | 2
3099
// 0xA | 1010 | 2
3100
// 0xB | 1011 | 3
3101
// 0xC | 1100 | 2
3102
// 0xD | 1101 | 3
3103
// 0xE | 1110 | 3
3104
// 0xF | 1111 | 4
3105
let address = masm.asm.add_constant(&[
3106
0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
3107
]);
3108
masm.asm
3109
.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
3110
// Use the upper 4 bits as an index into the lookup table.
3111
masm.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
3112
// Use the lower 4 bits as an index into the lookup table.
3113
masm.asm
3114
.xmm_vpshufb_rrr(scratch.writable(), reg2.to_reg(), scratch.inner());
3115
context.free_reg(reg2.to_reg());
3116
3117
// Add the counts of the upper 4 bits and the lower 4 bits to get the
3118
// total number of bits set.
3119
masm.asm
3120
.xmm_vpadd_rrr(reg.to_reg(), scratch.inner(), reg, OperandSize::S8);
3121
wasmtime_environ::error::Ok(())
3122
})?;
3123
3124
context.stack.push(TypedReg::v128(reg.to_reg()).into());
3125
Ok(())
3126
}
3127
3128
fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3129
self.ensure_has_avx()?;
3130
self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
3131
Ok(())
3132
}
3133
3134
fn v128_div(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3135
self.ensure_has_avx()?;
3136
self.asm.xmm_vdivp_rrr(lhs, rhs, dst, size);
3137
Ok(())
3138
}
3139
3140
fn v128_sqrt(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3141
self.ensure_has_avx()?;
3142
self.asm.xmm_vsqrtp_rr(src, dst, size);
3143
Ok(())
3144
}
3145
3146
fn v128_ceil(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3147
self.ensure_has_avx()?;
3148
self.asm
3149
.xmm_vroundp_rri(src, dst, VroundMode::TowardPositiveInfinity, size);
3150
Ok(())
3151
}
3152
3153
fn v128_floor(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3154
self.ensure_has_avx()?;
3155
self.asm
3156
.xmm_vroundp_rri(src, dst, VroundMode::TowardNegativeInfinity, size);
3157
Ok(())
3158
}
3159
3160
fn v128_nearest(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3161
self.ensure_has_avx()?;
3162
self.asm
3163
.xmm_vroundp_rri(src, dst, VroundMode::TowardNearest, size);
3164
Ok(())
3165
}
3166
3167
fn v128_pmin(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3168
self.ensure_has_avx()?;
3169
// Reverse operands since Wasm specifies returning the first operand if
3170
// either operand is NaN while x86 returns the second operand.
3171
self.asm.xmm_vminp_rrr(rhs, lhs, dst, size);
3172
Ok(())
3173
}
3174
3175
fn v128_pmax(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
3176
self.ensure_has_avx()?;
3177
// Reverse operands since Wasm specifies returning the first operand if
3178
// either operand is NaN while x86 returns the second operand.
3179
self.asm.xmm_vmaxp_rrr(rhs, lhs, dst, size);
3180
Ok(())
3181
}
3182
}
3183
3184
impl MacroAssembler {
3185
/// Create an x64 MacroAssembler.
3186
pub fn new(
3187
ptr_size: impl PtrSize,
3188
shared_flags: settings::Flags,
3189
isa_flags: x64_settings::Flags,
3190
) -> Result<Self> {
3191
let ptr_type: WasmValType = ptr_type_from_ptr_size(ptr_size.size());
3192
3193
Ok(Self {
3194
sp_offset: 0,
3195
sp_max: 0,
3196
stack_max_use_add: None,
3197
asm: Assembler::new(shared_flags.clone(), isa_flags.clone()),
3198
flags: isa_flags,
3199
shared_flags,
3200
ptr_size: ptr_type.try_into()?,
3201
scratch_scope: RegAlloc::from(scratch_gpr_bitset(), scratch_fpr_bitset()),
3202
})
3203
}
3204
3205
/// Add the maximum stack used to a register, recording an obligation to update the
3206
/// add-with-immediate instruction emitted to use the real stack max when the masm is being
3207
/// finalized.
3208
fn add_stack_max(&mut self, reg: Reg) {
3209
assert!(self.stack_max_use_add.is_none());
3210
let patch = PatchableAddToReg::new(reg, OperandSize::S64, &mut self.asm);
3211
self.stack_max_use_add.replace(patch);
3212
}
3213
3214
fn ensure_has_avx(&self) -> Result<()> {
3215
crate::ensure!(self.flags.has_avx(), CodeGenError::UnimplementedForNoAvx);
3216
Ok(())
3217
}
3218
3219
fn ensure_has_avx2(&self) -> Result<()> {
3220
crate::ensure!(self.flags.has_avx2(), CodeGenError::UnimplementedForNoAvx2);
3221
Ok(())
3222
}
3223
3224
fn ensure_has_avx512vl(&self) -> Result<()> {
3225
crate::ensure!(
3226
self.flags.has_avx512vl(),
3227
CodeGenError::UnimplementedForNoAvx512VL
3228
);
3229
Ok(())
3230
}
3231
3232
fn ensure_has_avx512dq(&self) -> Result<()> {
3233
crate::ensure!(
3234
self.flags.has_avx512dq(),
3235
CodeGenError::UnimplementedForNoAvx512DQ
3236
);
3237
Ok(())
3238
}
3239
3240
fn increment_sp(&mut self, bytes: u32) {
3241
self.sp_offset += bytes;
3242
3243
// NOTE: we use `max` here to track the largest stack allocation in `sp_max`. Once we have
3244
// seen the entire function, this value will represent the maximum size for the stack
3245
// frame.
3246
self.sp_max = self.sp_max.max(self.sp_offset);
3247
}
3248
3249
fn decrement_sp(&mut self, bytes: u32) {
3250
assert!(
3251
self.sp_offset >= bytes,
3252
"sp offset = {}; bytes = {}",
3253
self.sp_offset,
3254
bytes
3255
);
3256
self.sp_offset -= bytes;
3257
}
3258
3259
fn load_constant(&mut self, constant: &I, dst: WritableReg, size: OperandSize) -> Result<()> {
3260
match constant {
3261
I::I32(v) => Ok(self.asm.mov_ir(*v as u64, dst, size)),
3262
I::I64(v) => Ok(self.asm.mov_ir(*v, dst, size)),
3263
I::F32(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3264
I::F64(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3265
I::V128(_) => Ok(self.asm.load_fp_const(dst, &constant.to_bytes(), size)),
3266
}
3267
}
3268
3269
/// A common implementation for zero-extend stack loads.
3270
fn load_impl(
3271
&mut self,
3272
src: Address,
3273
dst: WritableReg,
3274
size: OperandSize,
3275
flags: MemFlags,
3276
) -> Result<()> {
3277
if dst.to_reg().is_int() {
3278
let ext = size.extend_to::<Zero>(OperandSize::S64);
3279
self.asm.movzx_mr(&src, dst, ext, flags);
3280
} else {
3281
self.asm.xmm_mov_mr(&src, dst, size, flags);
3282
}
3283
3284
Ok(())
3285
}
3286
3287
/// A common implementation for stack stores.
3288
fn store_impl(
3289
&mut self,
3290
src: RegImm,
3291
dst: Address,
3292
size: OperandSize,
3293
flags: MemFlags,
3294
) -> Result<()> {
3295
let _ = match src {
3296
RegImm::Imm(imm) => match imm {
3297
I::I32(v) => self.asm.mov_im(v as i32, &dst, size, flags),
3298
I::I64(v) => match v.try_into() {
3299
Ok(v) => self.asm.mov_im(v, &dst, size, flags),
3300
Err(_) => {
3301
// If the immediate doesn't sign extend, use a scratch
3302
// register.
3303
self.with_scratch::<IntScratch, _>(|masm, scratch| {
3304
masm.asm.mov_ir(v, scratch.writable(), size);
3305
masm.asm.mov_rm(scratch.inner(), &dst, size, flags);
3306
});
3307
}
3308
},
3309
I::F32(v) => {
3310
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3311
self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3312
// Always trusted, since we are loading the constant from
3313
// the constant pool.
3314
masm.asm.xmm_mov_mr(
3315
&addr,
3316
float_scratch.writable(),
3317
size,
3318
MemFlags::trusted(),
3319
);
3320
masm.asm
3321
.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3322
});
3323
}
3324
I::F64(v) => {
3325
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3326
3327
self.with_scratch::<FloatScratch, _>(|masm, float_scratch| {
3328
// Similar to above, always trusted since we are loading the
3329
// constant from the constant pool.
3330
masm.asm.xmm_mov_mr(
3331
&addr,
3332
float_scratch.writable(),
3333
size,
3334
MemFlags::trusted(),
3335
);
3336
masm.asm
3337
.xmm_mov_rm(float_scratch.inner(), &dst, size, flags);
3338
});
3339
}
3340
I::V128(v) => {
3341
let addr = self.asm.add_constant(v.to_le_bytes().as_slice());
3342
self.with_scratch::<FloatScratch, _>(|masm, vector_scratch| {
3343
// Always trusted, since we are loading the constant from
3344
// the constant pool.
3345
masm.asm.xmm_mov_mr(
3346
&addr,
3347
vector_scratch.writable(),
3348
size,
3349
MemFlags::trusted(),
3350
);
3351
masm.asm
3352
.xmm_mov_rm(vector_scratch.inner(), &dst, size, flags);
3353
});
3354
}
3355
},
3356
RegImm::Reg(reg) => {
3357
if reg.is_int() {
3358
self.asm.mov_rm(reg, &dst, size, flags);
3359
} else {
3360
self.asm.xmm_mov_rm(reg, &dst, size, flags);
3361
}
3362
}
3363
};
3364
Ok(())
3365
}
3366
3367
fn ensure_two_argument_form(dst: &Reg, lhs: &Reg) -> Result<()> {
3368
if dst != lhs {
3369
Err(format_err!(CodeGenError::invalid_two_arg_form()))
3370
} else {
3371
Ok(())
3372
}
3373
}
3374
3375
/// The mask to use when performing a `vpshuf` operation for a 64-bit splat.
3376
fn vpshuf_mask_for_64_bit_splats() -> u8 {
3377
// Results in the first 4 bytes and second 4 bytes being
3378
// swapped and then the swapped bytes being copied.
3379
// [d0, d1, d2, d3, d4, d5, d6, d7, ...] yields
3380
// [d4, d5, d6, d7, d0, d1, d2, d3, d4, d5, d6, d7, d0, d1, d2, d3].
3381
0b01_00_01_00
3382
}
3383
3384
fn v128_trunc_sat_f32x4_s(
3385
&mut self,
3386
reg: WritableReg,
3387
src_lane_size: OperandSize,
3388
dst_lane_size: OperandSize,
3389
) -> Result<()> {
3390
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3391
// Create a mask to handle NaN values (1 for not NaN, 0 for
3392
// NaN).
3393
masm.asm.xmm_vcmpp_rrr(
3394
scratch.writable(),
3395
reg.to_reg(),
3396
reg.to_reg(),
3397
src_lane_size,
3398
VcmpKind::Eq,
3399
);
3400
// Zero out any NaN values.
3401
masm.asm
3402
.xmm_vandp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3403
// Create a mask for the sign bits.
3404
masm.asm
3405
.xmm_vpxor_rrr(scratch.inner(), reg.to_reg(), scratch.writable());
3406
// Convert floats to integers.
3407
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3408
// Apply sign mask to the converted integers.
3409
masm.asm
3410
.xmm_vpand_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
3411
// Create a saturation mask of all 1s for negative numbers,
3412
// all 0s for positive numbers. The arithmetic shift will copy
3413
// the sign bit.
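// E.g. for an input lane of 3.0e9 (above i32::MAX): `vcvttps2dq`
// returns the indefinite value 0x8000_0000; the float was positive, so
// the inverted float bits have the sign bit set, the AND keeps it, the
// arithmetic shift turns it into an all-1s mask, and the final XOR
// yields 0x7FFF_FFFF, the saturated result.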
3414
masm.asm
3415
.xmm_vpsra_rri(scratch.inner(), scratch.writable(), 0x1F, dst_lane_size);
3416
// Combine converted integers with saturation mask.
3417
masm.asm.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), reg);
3418
Ok(())
3419
})
3420
}
3421
3422
fn v128_trunc_sat_f32x4_u(
3423
&mut self,
3424
reg: WritableReg,
3425
temp_reg: WritableReg,
3426
src_lane_size: OperandSize,
3427
dst_lane_size: OperandSize,
3428
) -> Result<()> {
3429
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3430
// Set scratch to all zeros.
3431
masm.asm.xmm_vxorp_rrr(
3432
reg.to_reg(),
3433
reg.to_reg(),
3434
scratch.writable(),
3435
src_lane_size,
3436
);
3437
// Clamp negative numbers to 0.
3438
masm.asm
3439
.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3440
// Create a vector of all 1s.
3441
masm.asm.xmm_vpcmpeq_rrr(
3442
scratch.writable(),
3443
scratch.inner(),
3444
scratch.inner(),
3445
src_lane_size,
3446
);
3447
// Set scratch to 0x7FFFFFFF (max signed 32-bit integer) by
3448
// performing a logical shift right.
3449
masm.asm
3450
.xmm_vpsrl_rri(scratch.inner(), scratch.writable(), 0x1, src_lane_size);
3451
// Convert max signed int to float as a reference point for saturation.
3452
masm.asm
3453
.xmm_vcvt_rr(scratch.inner(), scratch.writable(), VcvtKind::I32ToF32);
3454
// Convert the floats to integers and put the results in `temp_reg`.
3455
// This is signed and not unsigned so we need to handle the
3456
// value for the high bit in each lane.
3457
masm.asm
3458
.xmm_vcvt_rr(reg.to_reg(), temp_reg, VcvtKind::F32ToI32);
3459
// Set `reg` lanes to the amount that the value in the lane
3460
// exceeds the maximum signed 32-bit integer.
3461
masm.asm
3462
.xmm_vsub_rrr(reg.to_reg(), scratch.inner(), reg, dst_lane_size);
3463
// Create mask in `scratch` for numbers that are larger than
3464
// the maximum signed 32-bit integer. Lanes that don't fit
3465
// in 32-bit ints will be all 1s.
3466
masm.asm.xmm_vcmpp_rrr(
3467
scratch.writable(),
3468
scratch.inner(),
3469
reg.to_reg(),
3470
dst_lane_size,
3471
VcmpKind::Le,
3472
);
3473
// Convert the excess over signed 32-bits from floats to integers.
3474
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F32ToI32);
3475
// Apply large number mask to excess values which will flip the
3476
// bits in any lanes that exceed signed 32-bits. Adding this
3477
// flipped value to the signed value will set the high bit and
3478
// the carry behavior will update the other bits correctly.
3479
masm.asm
3480
.xmm_vpxor_rrr(reg.to_reg(), scratch.inner(), scratch.writable());
3481
// Set `reg` to all 0s.
3482
masm.asm.xmm_vpxor_rrr(reg.to_reg(), reg.to_reg(), reg);
3483
// Ensure excess values are not negative by taking the max between
3484
// excess values and zero.
3485
masm.asm
3486
.xmm_vpmaxs_rrr(reg, scratch.inner(), reg.to_reg(), dst_lane_size);
3487
});
3488
// Perform the addition between the signed conversion value (in
3489
// `temp_reg`) and the flipped excess value (in `reg`) to get the
3490
// unsigned value.
3491
self.asm
3492
.xmm_vpadd_rrr(reg.to_reg(), temp_reg.to_reg(), reg, dst_lane_size);
3493
Ok(())
3494
}
3495
3496
fn v128_trunc_sat_f64x2_s_zero(
3497
&mut self,
3498
reg: WritableReg,
3499
src_lane_size: OperandSize,
3500
) -> Result<()> {
3501
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3502
// Create a NaN mask (1s for non-NaN, 0s for NaN).
3503
masm.asm.xmm_vcmpp_rrr(
3504
scratch.writable(),
3505
reg.to_reg(),
3506
reg.to_reg(),
3507
src_lane_size,
3508
VcmpKind::Eq,
3509
);
3510
// Clamp NaN values to maximum 64-bit float that can be
3511
// converted to an i32.
3512
let address = masm.asm.add_constant(&[
3513
0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF, 0xDF, 0x41, 0x00, 0x00, 0xC0, 0xFF, 0xFF, 0xFF,
3514
0xDF, 0x41,
3515
]);
3516
masm.asm
3517
.xmm_vandp_rrm(scratch.inner(), &address, scratch.writable(), src_lane_size);
3518
// Handle the saturation for values too large to fit in an i32.
3519
masm.asm
3520
.xmm_vminp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3521
// Convert the floats to integers.
3522
masm.asm.xmm_vcvt_rr(reg.to_reg(), reg, VcvtKind::F64ToI32);
3523
3524
Ok(())
3525
})
3526
}
3527
3528
fn v128_trunc_sat_f64x2_u_zero(
3529
&mut self,
3530
reg: WritableReg,
3531
src_lane_size: OperandSize,
3532
dst_lane_size: OperandSize,
3533
) -> Result<()> {
3534
self.with_scratch::<FloatScratch, _>(|masm, scratch| {
3535
// Zero out the scratch register.
3536
masm.asm.xmm_vxorp_rrr(
3537
scratch.inner(),
3538
scratch.inner(),
3539
scratch.writable(),
3540
src_lane_size,
3541
);
3542
// Clamp negative values to zero.
3543
masm.asm
3544
.xmm_vmaxp_rrr(reg.to_reg(), scratch.inner(), reg, src_lane_size);
3545
// Clamp value to maximum unsigned 32-bit integer value
3546
// (4294967295.0, i.e. 0x41EFFFFFFFE00000).
3547
let address = masm.asm.add_constant(&[
3548
0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF, 0xEF, 0x41, 0x00, 0x00, 0xE0, 0xFF, 0xFF, 0xFF,
3549
0xEF, 0x41,
3550
]);
3551
masm.asm
3552
.xmm_vminp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3553
// Truncate floating point values.
3554
masm.asm
3555
.xmm_vroundp_rri(reg.to_reg(), reg, VroundMode::TowardZero, src_lane_size);
3556
// Add 2^52 (doubles store 52 bits in their mantissa) to each
3557
// lane causing values in the lower bits to be shifted into
3558
// position for integer conversion.
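// E.g. a lane holding 7.0 becomes 2^52 + 7.0, whose bit pattern is
// 0x4330_0000_0000_0007, so the low 32 bits of the lane are exactly
// the integer 7; the shuffle below then gathers those low halves.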
3559
let address = masm.asm.add_constant(&[
3560
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x30, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3561
0x30, 0x43,
3562
]);
3563
masm.asm
3564
.xmm_vaddp_rrm(reg.to_reg(), &address, reg, src_lane_size);
3565
// Takes lanes 0 and 2 from `reg` (converted values) and lanes
3566
// 0 and 2 from `scratch` (zeroes) to put the converted ints in
3567
// the lower lanes and zeroes in the upper lanes.
3568
masm.asm.xmm_vshufp_rrri(
3569
reg.to_reg(),
3570
scratch.inner(),
3571
reg,
3572
0b10_00_10_00,
3573
dst_lane_size,
3574
);
3575
Ok(())
3576
})
3577
}
3578
3579
/// Given a vector of floats where lanes with NaN values are set to all 1s
3580
/// in `mask` and a vector register `dst` with a mix of non-NaN values and
3581
/// possibly non-canonical NaN values, this canonicalizes any NaNs in `dst`.
3582
fn canonicalize_nans(&mut self, mask: WritableReg, dst: WritableReg, size: OperandSize) {
3583
// Canonical NaNs do not preserve the sign bit, have the exponent bits
3584
// all set, and have only the high bit of the mantissa set so shift by
3585
// that number.
3586
// The mask we're producing in this step will be inverted in the next
3587
// step.
3588
let amount_to_shift = 1 + size.mantissa_bits() + 1;
3589
self.asm
3590
.xmm_vpsrl_rri(mask.to_reg(), mask, amount_to_shift as u32, size);
3591
// The mask will be inverted by the ANDN so non-NaN values will be all
3592
// 1s and NaN values will set the sign bit, exponent bits, and zero out
3593
// almost all of the mantissa.
3594
self.asm
3595
.xmm_vandnp_rrr(mask.to_reg(), dst.to_reg(), dst, size);
3596
}
3597
}