// Path: blob/main/cranelift/assembler-x64/src/fuzz.rs
// 3070 views
//! A fuzz testing oracle for roundtrip assembly-disassembly.1//!2//! This contains manual implementations of the `Arbitrary` trait for types3//! throughout this crate to avoid depending on the `arbitrary` crate4//! unconditionally (use the `fuzz` feature instead).56use std::string::{String, ToString};7use std::vec::Vec;8use std::{format, println};910use crate::{11AmodeOffset, AmodeOffsetPlusKnownOffset, AsReg, CodeSink, DeferredTarget, Fixed, Gpr, Inst,12KnownOffset, NonRspGpr, Registers, TrapCode, Xmm,13};14use arbitrary::{Arbitrary, Result, Unstructured};15use capstone::{Capstone, arch::BuildsCapstone, arch::BuildsCapstoneSyntax, arch::x86};1617/// Take a random assembly instruction and check its encoding and18/// pretty-printing against a known-good disassembler.19///20/// # Panics21///22/// This function panics to express failure as expected by the `arbitrary`23/// fuzzer infrastructure. It may fail during assembly, disassembly, or when24/// comparing the disassembled strings.25pub fn roundtrip(inst: &Inst<FuzzRegs>) {26// Check that we can actually assemble this instruction.27let assembled = assemble(inst);28let expected = disassemble(&assembled, inst);2930// Check that our pretty-printed output matches the known-good output. 
Trim31// off the instruction offset first.32let expected = expected.split_once(' ').unwrap().1;33let actual = inst.to_string();34if expected != actual && expected.trim() != fix_up(&actual) {35println!("> {inst}");36println!(" debug: {inst:x?}");37println!(" assembled: {}", pretty_print_hexadecimal(&assembled));38println!(" expected (capstone): {expected}");39println!(" actual (to_string): {actual}");40assert_eq!(expected, &actual);41}42}4344/// Use this assembler to emit machine code into a byte buffer.45///46/// This will skip any traps or label registrations, but this is fine for the47/// single-instruction disassembly we're doing here.48fn assemble(inst: &Inst<FuzzRegs>) -> Vec<u8> {49let mut sink = TestCodeSink::default();50inst.encode(&mut sink);51sink.patch_labels_as_if_they_referred_to_end();52sink.buf53}5455#[derive(Default)]56struct TestCodeSink {57buf: Vec<u8>,58offsets_using_label: Vec<usize>,59}6061impl TestCodeSink {62/// References to labels, e.g. RIP-relative addressing, is stored with an63/// adjustment that takes into account the distance from the relative offset64/// to the end of the instruction, where the offset is relative to. That65/// means that to indeed make the offset relative to the end of the66/// instruction, which is what we pretend all labels are bound to, it's67/// required that this adjustment is taken into account.68///69/// This function will iterate over all labels bound to this code sink and70/// pretend the label is found at the end of the `buf`. 
That means that the71/// distance from the label to the end of `buf` minus 4, which is the width72/// of the offset, is added to what's already present in the encoding buffer.73///74/// This is effectively undoing the `bytes_at_end` adjustment that's part of75/// `Amode::RipRelative` addressing.76fn patch_labels_as_if_they_referred_to_end(&mut self) {77let len = i32::try_from(self.buf.len()).unwrap();78for offset in self.offsets_using_label.iter() {79let range = self.buf[*offset..].first_chunk_mut::<4>().unwrap();80let offset = i32::try_from(*offset).unwrap() + 4;81let rel_distance = len - offset;82*range = (i32::from_le_bytes(*range) + rel_distance).to_le_bytes();83}84}85}8687impl CodeSink for TestCodeSink {88fn put1(&mut self, v: u8) {89self.buf.extend_from_slice(&[v]);90}9192fn put2(&mut self, v: u16) {93self.buf.extend_from_slice(&v.to_le_bytes());94}9596fn put4(&mut self, v: u32) {97self.buf.extend_from_slice(&v.to_le_bytes());98}99100fn put8(&mut self, v: u64) {101self.buf.extend_from_slice(&v.to_le_bytes());102}103104fn add_trap(&mut self, _: TrapCode) {}105106fn use_target(&mut self, _: DeferredTarget) {107let offset = self.buf.len();108self.offsets_using_label.push(offset);109}110111fn known_offset(&self, target: KnownOffset) -> i32 {112panic!("unsupported known target {target:?}")113}114}115116/// Building a new `Capstone` each time is suboptimal (TODO).117fn disassemble(assembled: &[u8], original: &Inst<FuzzRegs>) -> String {118let cs = Capstone::new()119.x86()120.mode(x86::ArchMode::Mode64)121.syntax(x86::ArchSyntax::Att)122.detail(true)123.build()124.expect("failed to create Capstone object");125let insts = cs126.disasm_all(assembled, 0x0)127.expect("failed to disassemble");128129if insts.len() != 1 {130println!("> {original}");131println!(" debug: {original:x?}");132println!(" assembled: {}", pretty_print_hexadecimal(&assembled));133assert_eq!(insts.len(), 1, "not a single instruction");134}135136let inst = insts.first().expect("at least one 
instruction");137if assembled.len() != inst.len() {138println!("> {original}");139println!(" debug: {original:x?}");140println!(" assembled: {}", pretty_print_hexadecimal(&assembled));141println!(142" capstone-assembled: {}",143pretty_print_hexadecimal(inst.bytes())144);145assert_eq!(assembled.len(), inst.len(), "extra bytes not disassembled");146}147148inst.to_string()149}150151fn pretty_print_hexadecimal(hex: &[u8]) -> String {152use core::fmt::Write;153let mut s = String::with_capacity(hex.len() * 2);154for b in hex {155write!(&mut s, "{b:02X}").unwrap();156}157s158}159160/// See `replace_signed_immediates`.161macro_rules! hex_print_signed_imm {162($hex:expr, $from:ty => $to:ty) => {{163let imm = <$from>::from_str_radix($hex, 16).unwrap() as $to;164let mut simm = String::new();165if imm < 0 {166simm.push_str("-");167}168let abs = match imm.checked_abs() {169Some(i) => i,170None => <$to>::MIN,171};172if imm > -10 && imm < 10 {173simm.push_str(&format!("{:x}", abs));174} else {175simm.push_str(&format!("0x{:x}", abs));176}177simm178}};179}180181/// Replace signed immediates in the disassembly with their unsigned hexadecimal182/// equivalent. This is only necessary to match `capstone`'s complex183/// pretty-printing rules; e.g. 
`capstone` will:184/// - omit the `0x` prefix when printing `0x0` as `0`.185/// - omit the `0x` prefix when print small values (less than 10)186/// - print negative values as `-0x...` (signed hex) instead of `0xff...`187/// (normal hex)188/// - print `mov` immediates as base-10 instead of base-16 (?!).189fn replace_signed_immediates(dis: &str) -> alloc::borrow::Cow<'_, str> {190match dis.find('$') {191None => dis.into(),192Some(idx) => {193let (prefix, rest) = dis.split_at(idx + 1); // Skip the '$'.194let (_, rest) = chomp("-", rest); // Skip the '-' if it's there.195let (_, rest) = chomp("0x", rest); // Skip the '0x' if it's there.196let n = rest.chars().take_while(char::is_ascii_hexdigit).count();197let (hex, rest) = rest.split_at(n); // Split at next non-hex character.198let simm = if dis.starts_with("mov") {199u64::from_str_radix(hex, 16).unwrap().to_string()200} else {201match hex.len() {2021 | 2 => hex_print_signed_imm!(hex, u8 => i8),2034 => hex_print_signed_imm!(hex, u16 => i16),2048 => hex_print_signed_imm!(hex, u32 => i32),20516 => hex_print_signed_imm!(hex, u64 => i64),206_ => panic!("unexpected length for hex: {hex}"),207}208};209format!("{prefix}{simm}{rest}").into()210}211}212}213214// See `replace_signed_immediates`.215fn chomp<'a>(pat: &str, s: &'a str) -> (&'a str, &'a str) {216if s.starts_with(pat) {217s.split_at(pat.len())218} else {219("", s)220}221}222223#[test]224fn replace() {225assert_eq!(226replace_signed_immediates("andl $0xffffff9a, %r11d"),227"andl $-0x66, %r11d"228);229assert_eq!(230replace_signed_immediates("xorq $0xffffffffffffffbc, 0x7f139ecc(%r9)"),231"xorq $-0x44, 0x7f139ecc(%r9)"232);233assert_eq!(234replace_signed_immediates("subl $0x3ca77a19, -0x1a030f40(%r14)"),235"subl $0x3ca77a19, -0x1a030f40(%r14)"236);237assert_eq!(238replace_signed_immediates("movq $0xffffffff864ae103, %rsi"),239"movq $18446744071667638531, %rsi"240);241}242243/// Remove everything after the first semicolon in the disassembly and trim any244/// trailing 
spaces. This is necessary to remove the implicit operands we end up245/// printing for Cranelift's sake.246fn remove_after_semicolon(dis: &str) -> &str {247match dis.find(';') {248None => dis,249Some(idx) => {250let (prefix, _) = dis.split_at(idx);251prefix.trim()252}253}254}255256#[test]257fn remove_after_parenthesis_test() {258assert_eq!(259remove_after_semicolon("imulb 0x7658eddd(%rcx) ;; implicit: %ax"),260"imulb 0x7658eddd(%rcx)"261);262}263264/// Run some post-processing on the disassembly to make it match Capstone.265fn fix_up(dis: &str) -> alloc::borrow::Cow<'_, str> {266let dis = remove_after_semicolon(dis);267replace_signed_immediates(&dis)268}269270/// Fuzz-specific registers.271///272/// For the fuzzer, we do not need any fancy register types; see [`FuzzReg`].273#[derive(Clone, Arbitrary, Debug)]274pub struct FuzzRegs;275276impl Registers for FuzzRegs {277type ReadGpr = FuzzReg;278type ReadWriteGpr = FuzzReg;279type WriteGpr = FuzzReg;280type ReadXmm = FuzzReg;281type ReadWriteXmm = FuzzReg;282type WriteXmm = FuzzReg;283}284285/// A simple `u8` register type for fuzzing only.286#[derive(Clone, Copy, Debug, PartialEq)]287pub struct FuzzReg(u8);288289impl<'a> Arbitrary<'a> for FuzzReg {290fn arbitrary(u: &mut arbitrary::Unstructured<'a>) -> arbitrary::Result<Self> {291Ok(Self(u.int_in_range(0..=15)?))292}293}294295impl AsReg for FuzzReg {296fn new(enc: u8) -> Self {297Self(enc)298}299fn enc(&self) -> u8 {300self.0301}302}303304impl Arbitrary<'_> for AmodeOffset {305fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {306// Custom implementation to try to generate some "interesting" offsets.307// For example choose either an arbitrary 8-bit or 32-bit number as the308// base, and then optionally shift that number to the left to create309// multiples of constants. This can help stress some of the more310// interesting encodings in EVEX instructions for example.311let base = if u.arbitrary()? 
{312i32::from(u.arbitrary::<i8>()?)313} else {314u.arbitrary::<i32>()?315};316Ok(match u.int_in_range(0..=5)? {3170 => AmodeOffset::ZERO,318n => AmodeOffset::new(base << (n - 1)),319})320}321}322323impl Arbitrary<'_> for AmodeOffsetPlusKnownOffset {324fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {325// For now, we don't generate offsets (TODO).326Ok(Self {327simm32: AmodeOffset::arbitrary(u)?,328offset: None,329})330}331}332333impl<R: AsReg, const E: u8> Arbitrary<'_> for Fixed<R, E> {334fn arbitrary(_: &mut Unstructured<'_>) -> Result<Self> {335Ok(Self::new(E))336}337}338339impl<R: AsReg> Arbitrary<'_> for NonRspGpr<R> {340fn arbitrary(u: &mut Unstructured<'_>) -> Result<Self> {341use crate::gpr::enc::*;342let gpr = u.choose(&[343RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8, R9, R10, R11, R12, R13, R14, R15,344])?;345Ok(Self::new(R::new(*gpr)))346}347}348impl<'a, R: AsReg> Arbitrary<'a> for Gpr<R> {349fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {350Ok(Self(R::new(u.int_in_range(0..=15)?)))351}352}353impl<'a, R: AsReg> Arbitrary<'a> for Xmm<R> {354fn arbitrary(u: &mut Unstructured<'a>) -> Result<Self> {355Ok(Self(R::new(u.int_in_range(0..=15)?)))356}357}358359/// Helper trait that's used to be the same as `Registers` except with an extra360/// `for<'a> Arbitrary<'a>` bound on all of the associated types.361pub trait RegistersArbitrary:362Registers<363ReadGpr: for<'a> Arbitrary<'a>,364ReadWriteGpr: for<'a> Arbitrary<'a>,365WriteGpr: for<'a> Arbitrary<'a>,366ReadXmm: for<'a> Arbitrary<'a>,367ReadWriteXmm: for<'a> Arbitrary<'a>,368WriteXmm: for<'a> Arbitrary<'a>,369>370{371}372373impl<R> RegistersArbitrary for R374where375R: Registers,376R::ReadGpr: for<'a> Arbitrary<'a>,377R::ReadWriteGpr: for<'a> Arbitrary<'a>,378R::WriteGpr: for<'a> Arbitrary<'a>,379R::ReadXmm: for<'a> Arbitrary<'a>,380R::ReadWriteXmm: for<'a> Arbitrary<'a>,381R::WriteXmm: for<'a> Arbitrary<'a>,382{383}384385#[cfg(test)]386mod test {387use super::*;388use arbtest::arbtest;389use 
std::sync::atomic::{AtomicUsize, Ordering};390391#[test]392fn smoke() {393let count = AtomicUsize::new(0);394arbtest(|u| {395let inst: Inst<FuzzRegs> = u.arbitrary()?;396roundtrip(&inst);397println!("#{}: {inst}", count.fetch_add(1, Ordering::SeqCst));398Ok(())399})400.budget_ms(1_000);401402// This will run the `roundtrip` fuzzer for one second. To repeatably403// test a single input, append `.seed(0x<failing seed>)`.404}405406#[test]407fn callq() {408for i in -500..500 {409println!("immediate: {i}");410let inst = crate::inst::callq_d::new(i);411roundtrip(&inst.into());412}413}414}415416417