Path: blob/main/cranelift/assembler-x64/src/fuzz.rs
1693 views
//! A fuzz testing oracle for roundtrip assembly-disassembly.1//!2//! This contains manual implementations of the `Arbitrary` trait for types3//! throughout this crate to avoid depending on the `arbitrary` crate4//! unconditionally (use the `fuzz` feature instead).56use crate::{7AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,8KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,9};10use arbitrary::{Arbitrary, Result, Unstructured};11use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};1213/// Take a random assembly instruction and check its encoding and14/// pretty-printing against a known-good disassembler.15///16/// # Panics17///18/// This function panics to express failure as expected by the `arbitrary`19/// fuzzer infrastructure. It may fail during assembly, disassembly, or when20/// comparing the disassembled strings.21pub fn roundtrip(inst: &Inst<FuzzRegs>) {22// Check that we can actually assemble this instruction.23let assembled = assemble(inst);24let expected = disassemble(&assembled, inst);2526// Check that our pretty-printed output matches the known-good output. Trim27// off the instruction offset first.28let expected = expected.split_once(' ').unwrap().1;29let actual = inst.to_string();30if expected != actual && expected.trim() != fix_up(&actual) {31println!("> {inst}");32println!(" debug: {inst:x?}");33println!(" assembled: {}", pretty_print_hexadecimal(&assembled));34println!(" expected (capstone): {expected}");35println!(" actual (to_string): {actual}");36assert_eq!(expected, &actual);37}38}3940/// Use this assembler to emit machine code into a byte buffer.41///42/// This will skip any traps or label registrations, but this is fine for the43/// single-instruction disassembly we're doing here.44fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {45let mut sink = TestCodeSink::default();46inst.encode(&mut sink);47sink.patch_labels_as_if_they_referred_to_end();48sink.buf49}5051#[derive(Default)]52struct TestCodeSink {53buf: Vec<u8>,54offsets_using_label: Vec<usize>,55}5657impl TestCodeSink {58/// References to labels, e.g. RIP-relative addressing, is stored with an59/// adjustment that takes into account the distance from the relative offset60/// to the end of the instruction, where the offset is relative to. That61/// means that to indeed make the offset relative to the end of the62/// instruction, which is what we pretend all labels are bound to, it's63/// required that this adjustment is taken into account.64///65/// This function will iterate over all labels bound to this code sink and66/// pretend the label is found at the end of the `buf`. That means that the67/// distance from the label to the end of `buf` minus 4, which is the width68/// of the offset, is added to what's already present in the encoding buffer.69///70/// This is effectively undoing the `bytes_at_end` adjustment that's part of71/// `Amode::RipRelative` addressing.72fn patch_labels_as_if_they_referred_to_end(&mut self) {73let len = i32::try_from(self.buf.len()).unwrap();74for offset in self.offsets_using_label.iter() {75let range = self.buf[*offset..].first_chunk_mut::<4>().unwrap();76let offset = i32::try_from(*offset).unwrap() + 4;77let rel_distance = len - offset;78*range = (i32::from_le_bytes(*range) + rel_distance).to_le_bytes();79}80}81}8283impl CodeSink for TestCodeSink {84fn put1(&mut self, v: u8) {85self.buf.extend_from_slice(&[v]);86}8788fn put2(&mut self, v: u16) {89self.buf.extend_from_slice(&v.to_le_bytes());90}9192fn put4(&mut self, v: u32) {93self.buf.extend_from_slice(&v.to_le_bytes());94}9596fn put8(&mut self, v: u64) {97self.buf.extend_from_slice(&v.to_le_bytes());98}99100fn add_trap(&mut self, _: TrapCode) {}101102fn use_target(&mut self, _: DeferredTarget) {103let offset = self.buf.len();104self.offsets_using_label.push(offset);105}106107fn known_offset(&self, target: KnownOffset) -> i32 {108panic!("unsupported known target {target:?}")109}110}111112/// Building a new `Capstone` each time is suboptimal (TODO).113fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {114let cs = Capstone::new()115.x86()116.mode(x86::ArchMode::Mode64)117.syntax(x86::ArchSyntax::Att)118.detail(true)119.build()120.expect("failed to create Capstone object");121let insts = cs122.disasm_all(assembled, 0x0)123.expect("failed to disassemble");124125if insts.len() != 1 {126println!("> {original}");127println!(" debug: {original:x?}");128println!(" assembled: {}", pretty_print_hexadecimal(&assembled));129assert_eq!(insts.len(), 1, "not a single instruction");130}131132let inst = insts.first().expect("at least one instruction");133if assembled.len() != inst.len() {134println!("> {original}");135println!(" debug: {original:x?}");136println!(" assembled: {}", pretty_print_hexadecimal(&assembled));137println!(138" capstone-assembled: {}",139pretty_print_hexadecimal(inst.bytes())140);141assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");142}143144inst.to_string()145}146147fn pretty_print_hexadecimal(hex: &[u8]) -> String {148use std::fmt::Write;149let mut s = String::with_capacity(hex.len() * 2);150for b in hex {151write!(&mut s, "{b:02X}").unwrap();152}153s154}155156/// See `replace_signed_immediates`.157macro_rules! hex_print_signed_imm {158($hex:expr, $from:ty => $to:ty) => {{159let imm = <$from>::from_str_radix($hex, 16).unwrap() as $to;160let mut simm = String::new();161if imm < 0 {162simm.push_str("-");163}164let abs = match imm.checked_abs() {165Some(i) => i,166None => <$to>::MIN,167};168if imm > -10 && imm < 10 {169simm.push_str(&format!("{:x}", abs));170} else {171simm.push_str(&format!("0x{:x}", abs));172}173simm174}};175}176177/// Replace signed immediates in the disassembly with their unsigned hexadecimal178/// equivalent. This is only necessary to match `capstone`'s complex179/// pretty-printing rules; e.g. `capstone` will:180/// - omit the `0x` prefix when printing `0x0` as `0`.181/// - omit the `0x` prefix when print small values (less than 10)182/// - print negative values as `-0x...` (signed hex) instead of `0xff...`183/// (normal hex)184/// - print `mov` immediates as base-10 instead of base-16 (?!).185fn replace_signed_immediates(dis: &str) -> std::borrow::Cow<'_, str> {186match dis.find('$') {187None => dis.into(),188Some(idx) => {189let (prefix, rest) = dis.split_at(idx + 1); // Skip the '$'.190let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.191let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.192let n = rest.chars().take_while(char::is_ascii_hexdigit).count();193let (hex, rest) = rest.split_at(n); // Split at next non-hex character.194let simm = if dis.starts_with("mov") {195u64::from_str_radix(hex, 16).unwrap().to_string()196} else {197match hex.len() {1981 | 2 => hex_print_signed_imm!(hex, u8 => i8),1994 => hex_print_signed_imm!(hex, u16 => i16),2008 => hex_print_signed_imm!(hex, u32 => i32),20116 => hex_print_signed_imm!(hex, u64 => i64),202_ => panic!("unexpected length for hex: {hex}"),203}204};205format!("{prefix}{simm}{rest}").into()206}207}208}209210// See `replace_signed_immediates`.211fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {212if s.starts_with(pat) {213s.split_at(pat.len())214} else {215("", s)216}217}218219#[test]220fn replace() {221assert_eq!(222replace_signed_immediates("andl $0xffffff9a, %r11d"),223"andl $-0x66, %r11d"224);225assert_eq!(226replace_signed_immediates("xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)"),227"xorq $-0x44, 0x7f139ecc(%r9)"228);229assert_eq!(230replace_signed_immediates("subl $0x3ca77a19, -0x1a030f40(%r14)"),231"subl $0x3ca77a19, -0x1a030f40(%r14)"232);233assert_eq!(234replace_signed_immediates("movq $0xffffffff864ae103, %rsi"),235"movq $18446744071667638531, %rsi"236);237}238239/// Remove everything after the first semicolon in the disassembly and trim any240/// trailing spaces. This is necessary to remove the implicit operands we end up241/// printing for Cranelift's sake.242fn remove_after_semicolon(dis: &str) -> &str {243match dis.find(';') {244None => dis,245Some(idx) => {246let (prefix, _) = dis.split_at(idx);247prefix.trim()248}249}250}251252#[test]253fn remove_after_parenthesis_test() {254assert_eq!(255remove_after_semicolon("imulb 0x7658eddd(%rcx) ;; implicit: %ax"),256"imulb 0x7658eddd(%rcx)"257);258}259260/// Run some post-processing on the disassembly to make it match Capstone.261fn fix_up(dis: &str) -> std::borrow::Cow<'_, str> {262let dis = remove_after_semicolon(dis);263replace_signed_immediates(&dis)264}265266/// Fuzz-specific registers.267///268/// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].269#[derive(Clone, Arbitrary, Debug)]270pub struct FuzzRegs;271272impl Registers for FuzzRegs {273type ReadGpr = FuzzReg;274type ReadWriteGpr = FuzzReg;275type WriteGpr = FuzzReg;276type ReadXmm = FuzzReg;277type ReadWriteXmm = FuzzReg;278type WriteXmm = FuzzReg;279}280281/// A simple `u8` register type for fuzzing only.282#[derive(Clone, Copy, Debug, PartialEq)]283pub struct FuzzReg(u8);284285impl<'a> Arbitrary<'a> for FuzzReg {286fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {287Ok(Self(u.int_in_range(0..=15)?))288}289}290291impl AsReg for FuzzReg {292fn new(enc: u8) -> Self {293Self(enc)294}295fn enc(&self) -> u8 {296self.0297}298}299300impl Arbitrary<'_> for AmodeOffset {301fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {302// Custom implementation to try to generate some "interesting" offsets.303// For example choose either an arbitrary 8-bit or 32-bit number as the304// base, and then optionally shift that number to the left to create305// multiples of constants. This can help stress some of the more306// interesting encodings in EVEX instructions for example.307let base = if u.arbitrary()? {308i32::from(u.arbitrary::<i8>()?)309} else {310u.arbitrary::<i32>()?311};312Ok(match u.int_in_range(0..=5)? {3130 => AmodeOffset::ZERO,314n => AmodeOffset::new(base << (n - 1)),315})316}317}318319impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {320fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {321// For now, we don't generate offsets (TODO).322Ok(Self {323simm32: AmodeOffset::arbitrary(u)?,324offset: None,325})326}327}328329impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {330fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {331Ok(Self::new(E))332}333}334335impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {336fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {337use crate::gpr::enc::*;338let gpr = u.choose(&[339RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,340])?;341Ok(Self::new(R::new(*gpr)))342}343}344impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {345fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {346Ok(Self(R::new(u.int_in_range(0..=15)?)))347}348}349impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {350fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {351Ok(Self(R::new(u.int_in_range(0..=15)?)))352}353}354355/// Helper trait that's used to be the same as `Registers` except with an extra356/// `for<'a> Arbitrary<'a>` bound on all of the associated types.357pub trait RegistersArbitrary:358Registers<359ReadGpr: for<'a> Arbitrary<'a>,360ReadWriteGpr: for<'a> Arbitrary<'a>,361WriteGpr: for<'a> Arbitrary<'a>,362ReadXmm: for<'a> Arbitrary<'a>,363ReadWriteXmm: for<'a> Arbitrary<'a>,364WriteXmm: for<'a> Arbitrary<'a>,365>366{367}368369impl<R> RegistersArbitrary for R370where371R: Registers,372R::ReadGpr: for<'a> Arbitrary<'a>,373R::ReadWriteGpr: for<'a> Arbitrary<'a>,374R::WriteGpr: for<'a> Arbitrary<'a>,375R::ReadXmm: for<'a> Arbitrary<'a>,376R::ReadWriteXmm: for<'a> Arbitrary<'a>,377R::WriteXmm: for<'a> Arbitrary<'a>,378{379}380381#[cfg(test)]382mod test {383use super::*;384use arbtest::arbtest;385use std::sync::atomic::{AtomicUsize, Ordering};386387#[test]388fn smoke() {389let count = AtomicUsize::new(0);390arbtest(|u| {391let inst: Inst<FuzzRegs> = u.arbitrary()?;392roundtrip(&inst);393println!("#{}: {inst}", count.fetch_add(1, Ordering::SeqCst));394Ok(())395})396.budget_ms(1_000);397398// This will run the `roundtrip` fuzzer for one second. To repeatably399// test a single input, append `.seed(0x<failing seed>)`.400}401402#[test]403fn callq() {404for i in -500..500 {405println!("immediate: {i}");406let inst = crate::inst::callq_d::new(i);407roundtrip(&inst.into());408}409}410}411412413