GitHub Repository: bytecodealliance/wasmtime
Path: blob/main/cranelift/assembler-x64/meta/src/instructions/lanes.rs

use crate::dsl::{Feature::*, Inst, Length::*, Location::*, TupleType::*};
use crate::dsl::{align, evex, fmt, inst, r, rex, rw, vex, w};

#[rustfmt::skip] // Keeps instructions on a single line.
pub fn list() -> Vec<Inst> {
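    // Each `inst(..)` entry below pairs a mnemonic with an operand format
    // (`fmt`), an encoding recipe (legacy/REX, VEX, or EVEX), and the CPU
    // features that gate it; `.alt(avx, "..")` links an SSE instruction to
    // its AVX-encoded alternative defined in this same list. Mirroring the
    // Intel manual's notation, `.r()` marks a ModRM `/r` operand and `.ib()`
    // a trailing 8-bit immediate.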
    // Note that `p{extr,ins}r{w,b}` below operate on either a 32-bit
    // register or a smaller-width memory location. This means that
    // disassembly in Capstone doesn't match `rm8`, for example. For now,
    // pretend both of these are `rm32` to get disassembly matching Capstone.
    let r32m8 = rm32;
    let r32m16 = rm32;

    vec![
        // Extract from a single XMM lane.
        inst("extractps", fmt("A", [w(rm32), r(xmm1), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x17]).r().ib(), (_64b | compat) & sse41).alt(avx, "vextractps_b"),
        inst("pextrb", fmt("A", [w(r32m8), r(xmm2), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x14]).r().ib(), (_64b | compat) & sse41).alt(avx, "vpextrb_a"),
        inst("pextrw", fmt("A", [w(r32), r(xmm2), r(imm8)]), rex([0x66, 0x0F, 0xC5]).r().ib(), (_64b | compat) & sse2).alt(avx, "vpextrw_a"),
        inst("pextrw", fmt("B", [w(r32m16), r(xmm2), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x15]).r().ib(), (_64b | compat) & sse41).alt(avx, "vpextrw_b"),
        inst("pextrd", fmt("A", [w(rm32), r(xmm2), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x16]).r().ib(), (_64b | compat) & sse41).alt(avx, "vpextrd_a"),
        inst("pextrq", fmt("A", [w(rm64), r(xmm2), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x16]).w().r().ib(), _64b & sse41).alt(avx, "vpextrq_a"),
        inst("vextractps", fmt("B", [w(rm32), r(xmm1), r(imm8)]), vex(L128)._66()._0f3a().wig().op(0x17).r().ib(), (_64b | compat) & avx),
        inst("vpextrb", fmt("A", [w(r32m8), r(xmm2), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x14).r().ib(), (_64b | compat) & avx),
        inst("vpextrw", fmt("A", [w(r32), r(xmm2), r(imm8)]), vex(L128)._66()._0f().w0().op(0xC5).r().ib(), (_64b | compat) & avx),
        inst("vpextrw", fmt("B", [w(r32m16), r(xmm2), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x15).r().ib(), (_64b | compat) & avx),
        inst("vpextrd", fmt("A", [w(rm32), r(xmm2), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x16).r().ib(), (_64b | compat) & avx),
inst("vpextrq", fmt("A", [w(rm64), r(xmm2), r(imm8)]), vex(L128)._66()._0f3a().w1().op(0x16).r().ib(), (_64b | compat) & avx),

        // Insert into a single XMM lane.
inst("insertps", fmt("A", [rw(xmm1), r(xmm_m32), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x21]).r().ib(), (_64b | compat) & sse41).alt(avx, "vinsertps_b"),
30
inst("pinsrb", fmt("A", [rw(xmm1), r(r32m8), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x20]).r().ib(), (_64b | compat) & sse41),
31
inst("pinsrw", fmt("A", [rw(xmm1), r(r32m16), r(imm8)]), rex([0x66, 0x0F, 0xC4]).r().ib(), (_64b | compat) & sse2),
32
inst("pinsrd", fmt("A", [rw(xmm1), r(rm32), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x22]).r().ib(), (_64b | compat) & sse41),
33
inst("pinsrq", fmt("A", [rw(xmm1), r(rm64), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x22]).r().ib().w(), _64b & sse41),
34
inst("vinsertps", fmt("B", [w(xmm1), r(xmm2), r(xmm_m32), r(imm8)]), vex(L128)._66()._0f3a().wig().op(0x21).r().ib(), (_64b | compat) & avx),
35
inst("vpinsrb", fmt("B", [w(xmm1), r(xmm2), r(r32m8), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x20).r().ib(), (_64b | compat) & avx),
36
inst("vpinsrw", fmt("B", [w(xmm1), r(xmm2), r(r32m16), r(imm8)]), vex(L128)._66()._0f().w0().op(0xC4).r().ib(), (_64b | compat) & avx),
37
inst("vpinsrd", fmt("B", [w(xmm1), r(xmm2), r(rm32), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x22).r().ib(), (_64b | compat) & avx),
38
inst("vpinsrq", fmt("B", [w(xmm1), r(xmm2), r(rm64), r(imm8)]), vex(L128)._66()._0f3a().w1().op(0x22).r().ib(), _64b & avx),

        // Extract sign masks: one bit per lane, packed into the low bits of
        // a GPR (floating-point lanes for `movmskp{s,d}`, byte lanes for
        // `pmovmskb`).
inst("movmskps", fmt("RM", [w(r32), r(xmm2)]), rex([0x0F, 0x50]).r(), (_64b | compat) & sse).alt(avx, "vmovmskps_rm"),
42
inst("movmskpd", fmt("RM", [w(r32), r(xmm2)]), rex([0x66, 0x0F, 0x50]).r(), (_64b | compat) & sse2).alt(avx, "vmovmskpd_rm"),
43
inst("pmovmskb", fmt("RM", [w(r32), r(xmm2)]), rex([0x66, 0x0F, 0xD7]).r(), (_64b | compat) & sse2).alt(avx, "vpmovmskb_rm"),
44
inst("vmovmskps", fmt("RM", [w(r32), r(xmm2)]), vex(L128)._0f().op(0x50).r(), (_64b | compat) & avx),
45
inst("vmovmskpd", fmt("RM", [w(r32), r(xmm2)]), vex(L128)._66()._0f().op(0x50).r(), (_64b | compat) & avx),
46
inst("vpmovmskb", fmt("RM", [w(r32), r(xmm2)]), vex(L128)._66()._0f().op(0xD7).r(), (_64b | compat) & avx),

        // Move two lower 32-bit floats to the high two lanes.
inst("movhps", fmt("A", [rw(xmm1), r(m64)]), rex([0x0F, 0x16]).r(), (_64b | compat) & sse).alt(avx, "vmovhps_b"),
50
inst("movlhps", fmt("RM", [rw(xmm1), r(xmm2)]), rex([0x0F, 0x16]).r(), (_64b | compat) & sse).alt(avx, "vmovlhps_rvm"),
        inst("vmovhps", fmt("B", [w(xmm2), r(xmm1), r(m64)]), vex(L128)._0f().op(0x16).r(), (_64b | compat) & avx),
        inst("vmovlhps", fmt("RVM", [w(xmm1), r(xmm2), r(xmm3)]), vex(L128)._0f().op(0x16).r(), (_64b | compat) & avx),

        // Duplicate the lower 64 bits of the source into 128 bits of the destination.
inst("movddup", fmt("A", [w(xmm1), r(xmm_m64)]), rex([0xF2, 0x0F, 0x12]).r(), (_64b | compat) & sse3).alt(avx, "vmovddup_a"),
56
inst("vmovddup", fmt("A", [w(xmm1), r(xmm_m64)]), vex(L128)._f2()._0f().op(0x12).r(), (_64b | compat) & avx),

        // Blend lanes in various ways.
inst("pblendw", fmt("RMI", [rw(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0x66, 0x0F, 0x3A, 0x0E]).r().ib(), (_64b | compat) & sse41).alt(avx, "vpblendw_rvmi"),
        inst("pblendvb", fmt("RM0", [rw(xmm1), r(align(xmm_m128)), r(xmm0)]), rex([0x66, 0x0F, 0x38, 0x10]).r(), (_64b | compat) & sse41),
        inst("blendvps", fmt("RM0", [rw(xmm1), r(align(xmm_m128)), r(xmm0)]), rex([0x66, 0x0F, 0x38, 0x14]).r(), (_64b | compat) & sse41),
        inst("blendvpd", fmt("RM0", [rw(xmm1), r(align(xmm_m128)), r(xmm0)]), rex([0x66, 0x0F, 0x38, 0x15]).r(), (_64b | compat) & sse41),
        inst("vpblendw", fmt("RVMI", [w(xmm1), r(xmm2), r(xmm_m128), r(imm8)]), vex(L128)._66()._0f3a().w0().op(0x0E).r().ib(), (_64b | compat) & avx),
        inst("vpblendvb", fmt("RVMR", [w(xmm1), r(xmm2), r(xmm_m128), r(xmm3)]), vex(L128)._66()._0f3a().w0().op(0x4C).r().is4(), (_64b | compat) & avx),
        inst("vblendvps", fmt("RVMR", [w(xmm1), r(xmm2), r(xmm_m128), r(xmm3)]), vex(L128)._66()._0f3a().w0().op(0x4A).r().is4(), (_64b | compat) & avx),
        inst("vblendvpd", fmt("RVMR", [w(xmm1), r(xmm2), r(xmm_m128), r(xmm3)]), vex(L128)._66()._0f3a().w0().op(0x4B).r().is4(), (_64b | compat) & avx),

        // Shuffle lanes in various ways.
inst("shufpd", fmt("A", [rw(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0x66, 0x0F, 0xC6]).ib(), (_64b | compat) & sse2).alt(avx, "vshufpd_b"),
70
inst("vshufpd", fmt("B", [w(xmm1), r(xmm2), r(xmm_m128), r(imm8)]), vex(L128)._66()._0f().ib().op(0xC6), (_64b | compat) & avx),
71
inst("shufps", fmt("A", [rw(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0x0F, 0xC6]).ib(), (_64b | compat) & sse).alt(avx, "vshufps_b"),
72
inst("vshufps", fmt("B", [w(xmm1), r(xmm2), r(xmm_m128), r(imm8)]), vex(L128)._0f().ib().op(0xC6), (_64b | compat) & avx),
73
inst("pshufb", fmt("A", [rw(xmm1), r(align(xmm_m128))]), rex([0x66, 0x0F, 0x38, 0x00]), (_64b | compat) & ssse3).alt(avx, "vpshufb_b"),
74
inst("pshufd", fmt("A", [w(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0x66, 0x0F, 0x70]).r().ib(), (_64b | compat) & sse2).alt(avx, "vpshufd_a"),
75
inst("pshuflw", fmt("A", [w(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0xF2, 0x0F, 0x70]).r().ib(), (_64b | compat) & sse2).alt(avx, "vpshuflw_a"),
76
inst("pshufhw", fmt("A", [w(xmm1), r(align(xmm_m128)), r(imm8)]), rex([0xF3, 0x0F, 0x70]).r().ib(), (_64b | compat) & sse2).alt(avx, "vpshufhw_a"),
77
inst("vpshufb", fmt("B", [w(xmm1), r(xmm2), r(xmm_m128)]), vex(L128)._66()._0f38().op(0x00), (_64b | compat) & avx),
78
inst("vpshufd", fmt("A", [w(xmm1), r(xmm_m128), r(imm8)]), vex(L128)._66()._0f().op(0x70).r().ib(), (_64b | compat) & avx),
79
inst("vpshuflw", fmt("A", [w(xmm1), r(xmm_m128), r(imm8)]), vex(L128)._f2()._0f().op(0x70).r().ib(), (_64b | compat) & avx),
80
inst("vpshufhw", fmt("A", [w(xmm1), r(xmm_m128), r(imm8)]), vex(L128)._f3()._0f().op(0x70).r().ib(), (_64b | compat) & avx),

        // Broadcast a single lane to all lanes of the destination.
inst("vbroadcastss", fmt("A_M", [w(xmm1), r(m32)]), vex(L128)._66()._0f38().w0().op(0x18).r(), (_64b | compat) & avx),
84
inst("vbroadcastss", fmt("A_R", [w(xmm1), r(xmm2)]), vex(L128)._66()._0f38().w0().op(0x18).r(), (_64b | compat) & avx2),
        inst("vpbroadcastb", fmt("A", [w(xmm1), r(xmm_m8)]), vex(L128)._66()._0f38().w0().op(0x78).r(), (_64b | compat) & avx2),
        inst("vpbroadcastw", fmt("A", [w(xmm1), r(xmm_m16)]), vex(L128)._66()._0f38().w0().op(0x79).r(), (_64b | compat) & avx2),
        inst("vpbroadcastd", fmt("A", [w(xmm1), r(xmm_m32)]), vex(L128)._66()._0f38().w0().op(0x58).r(), (_64b | compat) & avx2),
        inst("vpbroadcastq", fmt("A", [w(xmm1), r(xmm_m64)]), vex(L128)._66()._0f38().w0().op(0x59).r(), (_64b | compat) & avx2),

        // AVX-512 permutations.
        inst("vpermi2b", fmt("A", [rw(xmm1), r(xmm2), r(xmm_m128)]), evex(L128, FullMem)._66()._0f38().w0().op(0x75).r(), (_64b | compat) & avx512vl & avx512vbmi),
    ]
}