Path: blob/main/cranelift/reader/src/lexer.rs
2450 views
//! Lexical analysis for .clif files.12use crate::error::Location;3use cranelift_codegen::ir::types;4use cranelift_codegen::ir::{Block, Value};5use std::str::CharIndices;6use std::u16;78/// A Token returned from the `Lexer`.9///10/// Some variants may contains references to the original source text, so the `Token` has the same11/// lifetime as the source.12#[derive(Debug, PartialEq, Eq, Clone, Copy)]13pub enum Token<'a> {14Comment(&'a str),15LPar, // '('16RPar, // ')'17LBrace, // '{'18RBrace, // '}'19LBracket, // '['20RBracket, // ']'21LAngle, // '<'22RAngle, // '>'23Minus, // '-'24Plus, // '+'25Multiply, // '*'26Comma, // ','27Dot, // '.'28Colon, // ':'29Equal, // '='30Bang, // '!'31At, // '@'32Arrow, // '->'33Float(&'a str), // Floating point immediate34Integer(&'a str), // Integer immediate35Type(types::Type), // i32, f32, i32x4, ...36DynamicType(u32), // dt537Value(Value), // v12, v738Block(Block), // block339Cold, // cold (flag on block)40StackSlot(u32), // ss341DynamicStackSlot(u32), // dss442GlobalValue(u32), // gv343MemoryType(u32), // mt044Constant(u32), // const245FuncRef(u32), // fn246SigRef(u32), // sig247UserRef(u32), // u34548UserNameRef(u32), // userextname34549ExceptionTableRef(u32), // ex12350ExceptionTag(u32), // tag12351TryCallRet(u32), // ret12352TryCallExn(u32), // exn12353Name(&'a str), // %9arbitrary_alphanum, %x3, %0, %function ...54String(&'a str), // "arbitrary quoted string with no escape" ...55HexSequence(&'a str), // #89AF56Identifier(&'a str), // Unrecognized identifier (opcode, enumerator, ...)57SourceLoc(&'a str), // @00c758}5960/// A `Token` with an associated location.61#[derive(Debug, PartialEq, Eq)]62pub struct LocatedToken<'a> {63pub token: Token<'a>,64pub location: Location,65}6667/// Wrap up a `Token` with the given location.68fn token(token: Token, loc: Location) -> Result<LocatedToken, LocatedError> {69Ok(LocatedToken {70token,71location: loc,72})73}7475/// An error from the lexical analysis.76#[derive(Debug, Clone, Copy, PartialEq, Eq)]77pub enum LexError {78InvalidChar,79}8081/// A `LexError` with an associated Location.82#[derive(Debug, Clone, Copy, PartialEq, Eq)]83pub struct LocatedError {84pub error: LexError,85pub location: Location,86}8788/// Wrap up a `LexError` with the given location.89fn error<'a>(error: LexError, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {90Err(LocatedError {91error,92location: loc,93})94}9596/// Get the number of decimal digits at the end of `s`.97fn trailing_digits(s: &str) -> usize {98// It's faster to iterate backwards over bytes, and we're only counting ASCII digits.99s.as_bytes()100.iter()101.rev()102.take_while(|&&b| b'0' <= b && b <= b'9')103.count()104}105106/// Pre-parse a supposed entity name by splitting it into two parts: A head of lowercase ASCII107/// letters and numeric tail.108pub fn split_entity_name(name: &str) -> Option<(&str, u32)> {109let (head, tail) = name.split_at(name.len() - trailing_digits(name));110if tail.len() > 1 && tail.starts_with('0') {111None112} else {113tail.parse().ok().map(|n| (head, n))114}115}116117/// Lexical analysis.118///119/// A `Lexer` reads text from a `&str` and provides a sequence of tokens.120///121/// Also keep track of a line number for error reporting.122///123pub struct Lexer<'a> {124// Complete source being processed.125source: &'a str,126127// Iterator into `source`.128chars: CharIndices<'a>,129130// Next character to be processed, or `None` at the end.131lookahead: Option<char>,132133// Index into `source` of lookahead character.134pos: usize,135136// Current line number.137line_number: usize,138}139140impl<'a> Lexer<'a> {141pub fn new(s: &'a str) -> Self {142let mut lex = Self {143source: s,144chars: s.char_indices(),145lookahead: None,146pos: 0,147line_number: 1,148};149// Advance to the first char.150lex.next_ch();151lex152}153154// Advance to the next character.155// Return the next lookahead character, or None when the end is encountered.156// Always update cur_ch to reflect157fn next_ch(&mut self) -> Option<char> {158if self.lookahead == Some('\n') {159self.line_number += 1;160}161match self.chars.next() {162Some((idx, ch)) => {163self.pos = idx;164self.lookahead = Some(ch);165}166None => {167self.pos = self.source.len();168self.lookahead = None;169}170}171self.lookahead172}173174// Get the location corresponding to `lookahead`.175fn loc(&self) -> Location {176Location {177line_number: self.line_number,178}179}180181// Starting from `lookahead`, are we looking at `prefix`?182fn looking_at(&self, prefix: &str) -> bool {183self.source[self.pos..].starts_with(prefix)184}185186// Starting from `lookahead`, are we looking at a number?187fn looking_at_numeric(&self) -> bool {188if let Some(c) = self.lookahead {189match c {190'0'..='9' => return true,191'-' => return true,192'+' => return true,193'.' => return true,194_ => {}195}196if self.looking_at("NaN") || self.looking_at("Inf") || self.looking_at("sNaN") {197return true;198}199}200false201}202203// Scan a single-char token.204fn scan_char(&mut self, tok: Token<'a>) -> Result<LocatedToken<'a>, LocatedError> {205assert_ne!(self.lookahead, None);206let loc = self.loc();207self.next_ch();208token(tok, loc)209}210211// Scan a multi-char token.212fn scan_chars(213&mut self,214count: usize,215tok: Token<'a>,216) -> Result<LocatedToken<'a>, LocatedError> {217let loc = self.loc();218for _ in 0..count {219assert_ne!(self.lookahead, None);220self.next_ch();221}222token(tok, loc)223}224225/// Get the rest of the current line.226/// The next token returned by `next()` will be from the following lines.227pub fn rest_of_line(&mut self) -> &'a str {228let begin = self.pos;229loop {230match self.next_ch() {231None | Some('\n') => return &self.source[begin..self.pos],232_ => {}233}234}235}236237// Scan a comment extending to the end of the current line.238fn scan_comment(&mut self) -> Result<LocatedToken<'a>, LocatedError> {239let loc = self.loc();240let text = self.rest_of_line();241token(Token::Comment(text), loc)242}243244// Scan a number token which can represent either an integer or floating point number.245//246// Accept the following forms:247//248// - `10`: Integer249// - `-10`: Integer250// - `0xff_00`: Integer251// - `0.0`: Float252// - `0x1.f`: Float253// - `-0x2.4`: Float254// - `0x0.4p-34`: Float255//256// This function does not filter out all invalid numbers. It depends in the context-sensitive257// decoding of the text for that. For example, the number of allowed digits in an `Ieee32` and258// an `Ieee64` constant are different.259fn scan_number(&mut self) -> Result<LocatedToken<'a>, LocatedError> {260let begin = self.pos;261let loc = self.loc();262let mut is_float = false;263264// Skip a leading sign.265match self.lookahead {266Some('-') => {267self.next_ch();268if !self.looking_at_numeric() {269// If the next characters won't parse as a number, we return Token::Minus270return token(Token::Minus, loc);271}272}273Some('+') => {274self.next_ch();275if !self.looking_at_numeric() {276// If the next characters won't parse as a number, we return Token::Plus277return token(Token::Plus, loc);278}279}280_ => {}281}282283// Check for NaNs with payloads.284if self.looking_at("NaN:") || self.looking_at("sNaN:") {285// Skip the `NaN:` prefix, the loop below won't accept it.286// We expect a hexadecimal number to follow the colon.287while self.next_ch() != Some(':') {}288is_float = true;289} else if self.looking_at("NaN") || self.looking_at("Inf") {290// This is Inf or a default quiet NaN.291is_float = true;292}293294// Look for the end of this number. Detect the radix point if there is one.295loop {296match self.next_ch() {297Some('-') | Some('_') => {}298Some('.') => is_float = true,299Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}300_ => break,301}302}303let text = &self.source[begin..self.pos];304if is_float {305token(Token::Float(text), loc)306} else {307token(Token::Integer(text), loc)308}309}310311// Scan a 'word', which is an identifier-like sequence of characters beginning with '_' or an312// alphabetic char, followed by zero or more alphanumeric or '_' characters.313fn scan_word(&mut self) -> Result<LocatedToken<'a>, LocatedError> {314let begin = self.pos;315let loc = self.loc();316317assert!(self.lookahead == Some('_') || self.lookahead.unwrap().is_ascii_alphabetic());318loop {319match self.next_ch() {320Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}321_ => break,322}323}324let text = &self.source[begin..self.pos];325326// Look for numbered well-known entities like block15, v45, ...327token(328split_entity_name(text)329.and_then(|(prefix, number)| {330Self::numbered_entity(prefix, number)331.or_else(|| Self::value_type(text, prefix, number))332})333.unwrap_or_else(|| match text {334"cold" => Token::Cold,335_ => Token::Identifier(text),336}),337loc,338)339}340341// If prefix is a well-known entity prefix and suffix is a valid entity number, return the342// decoded token.343fn numbered_entity(prefix: &str, number: u32) -> Option<Token<'a>> {344match prefix {345"v" => Value::with_number(number).map(Token::Value),346"block" => Block::with_number(number).map(Token::Block),347"ss" => Some(Token::StackSlot(number)),348"dss" => Some(Token::DynamicStackSlot(number)),349"dt" => Some(Token::DynamicType(number)),350"gv" => Some(Token::GlobalValue(number)),351"mt" => Some(Token::MemoryType(number)),352"const" => Some(Token::Constant(number)),353"fn" => Some(Token::FuncRef(number)),354"sig" => Some(Token::SigRef(number)),355"u" => Some(Token::UserRef(number)),356"userextname" => Some(Token::UserNameRef(number)),357"extable" => Some(Token::ExceptionTableRef(number)),358"tag" => Some(Token::ExceptionTag(number)),359"ret" => Some(Token::TryCallRet(number)),360"exn" => Some(Token::TryCallExn(number)),361_ => None,362}363}364365// Recognize a scalar or vector type.366fn value_type(text: &str, prefix: &str, number: u32) -> Option<Token<'a>> {367let is_vector = prefix.ends_with('x');368let scalar = if is_vector {369&prefix[0..prefix.len() - 1]370} else {371text372};373let base_type = match scalar {374"i8" => types::I8,375"i16" => types::I16,376"i32" => types::I32,377"i64" => types::I64,378"i128" => types::I128,379"f16" => types::F16,380"f32" => types::F32,381"f64" => types::F64,382"f128" => types::F128,383_ => return None,384};385if is_vector {386if number <= u32::from(u16::MAX) {387base_type.by(number).map(Token::Type)388} else {389None390}391} else {392Some(Token::Type(base_type))393}394}395396fn scan_name(&mut self) -> Result<LocatedToken<'a>, LocatedError> {397let loc = self.loc();398let begin = self.pos + 1;399400assert_eq!(self.lookahead, Some('%'));401402loop {403match self.next_ch() {404Some('_') | Some('0'..='9') | Some('a'..='z') | Some('A'..='Z') => {}405_ => break,406}407}408409let end = self.pos;410token(Token::Name(&self.source[begin..end]), loc)411}412413/// Scan for a multi-line quoted string with no escape character.414fn scan_string(&mut self) -> Result<LocatedToken<'a>, LocatedError> {415let loc = self.loc();416let begin = self.pos + 1;417418assert_eq!(self.lookahead, Some('"'));419420while let Some(c) = self.next_ch() {421if c == '"' {422break;423}424}425426let end = self.pos;427if self.lookahead != Some('"') {428return error(LexError::InvalidChar, self.loc());429}430self.next_ch();431token(Token::String(&self.source[begin..end]), loc)432}433434fn scan_hex_sequence(&mut self) -> Result<LocatedToken<'a>, LocatedError> {435let loc = self.loc();436let begin = self.pos + 1;437438assert_eq!(self.lookahead, Some('#'));439440while let Some(c) = self.next_ch() {441if !char::is_digit(c, 16) {442break;443}444}445446let end = self.pos;447token(Token::HexSequence(&self.source[begin..end]), loc)448}449450/// Given that we've consumed an `@` character, are we looking at a source451/// location?452fn looking_at_srcloc(&self) -> bool {453match self.lookahead {454Some(c) => char::is_digit(c, 16),455_ => false,456}457}458459fn scan_srcloc(&mut self, pos: usize, loc: Location) -> Result<LocatedToken<'a>, LocatedError> {460let begin = pos + 1;461while let Some(c) = self.next_ch() {462if !char::is_digit(c, 16) {463break;464}465}466467let end = self.pos;468token(Token::SourceLoc(&self.source[begin..end]), loc)469}470471/// Get the next token or a lexical error.472///473/// Return None when the end of the source is encountered.474pub fn next(&mut self) -> Option<Result<LocatedToken<'a>, LocatedError>> {475loop {476let loc = self.loc();477return match self.lookahead {478None => None,479Some(';') => Some(self.scan_comment()),480Some('(') => Some(self.scan_char(Token::LPar)),481Some(')') => Some(self.scan_char(Token::RPar)),482Some('{') => Some(self.scan_char(Token::LBrace)),483Some('}') => Some(self.scan_char(Token::RBrace)),484Some('[') => Some(self.scan_char(Token::LBracket)),485Some(']') => Some(self.scan_char(Token::RBracket)),486Some('<') => Some(self.scan_char(Token::LAngle)),487Some('>') => Some(self.scan_char(Token::RAngle)),488Some(',') => Some(self.scan_char(Token::Comma)),489Some('.') => Some(self.scan_char(Token::Dot)),490Some(':') => Some(self.scan_char(Token::Colon)),491Some('=') => Some(self.scan_char(Token::Equal)),492Some('!') => Some(self.scan_char(Token::Bang)),493Some('+') => Some(self.scan_number()),494Some('*') => Some(self.scan_char(Token::Multiply)),495Some('-') => {496if self.looking_at("->") {497Some(self.scan_chars(2, Token::Arrow))498} else {499Some(self.scan_number())500}501}502Some('0'..='9') => Some(self.scan_number()),503Some('a'..='z') | Some('A'..='Z') => {504if self.looking_at("NaN") || self.looking_at("Inf") {505Some(self.scan_number())506} else {507Some(self.scan_word())508}509}510Some('%') => Some(self.scan_name()),511Some('"') => Some(self.scan_string()),512Some('#') => Some(self.scan_hex_sequence()),513Some('@') => {514let pos = self.pos;515let loc = self.loc();516self.next_ch();517if self.looking_at_srcloc() {518Some(self.scan_srcloc(pos, loc))519} else {520Some(token(Token::At, loc))521}522}523// all ascii whitespace524Some(' ') | Some('\x09'..='\x0d') => {525self.next_ch();526continue;527}528_ => {529// Skip invalid char, return error.530self.next_ch();531Some(error(LexError::InvalidChar, loc))532}533};534}535}536}537538#[cfg(test)]539mod tests {540use super::*;541542#[test]543fn digits() {544assert_eq!(trailing_digits(""), 0);545assert_eq!(trailing_digits("x"), 0);546assert_eq!(trailing_digits("0x"), 0);547assert_eq!(trailing_digits("x1"), 1);548assert_eq!(trailing_digits("1x1"), 1);549assert_eq!(trailing_digits("1x01"), 2);550}551552#[test]553fn entity_name() {554assert_eq!(split_entity_name(""), None);555assert_eq!(split_entity_name("x"), None);556assert_eq!(split_entity_name("x+"), None);557assert_eq!(split_entity_name("x+1"), Some(("x+", 1)));558assert_eq!(split_entity_name("x-1"), Some(("x-", 1)));559assert_eq!(split_entity_name("1"), Some(("", 1)));560assert_eq!(split_entity_name("x1"), Some(("x", 1)));561assert_eq!(split_entity_name("xy0"), Some(("xy", 0)));562// Reject this non-canonical form.563assert_eq!(split_entity_name("inst01"), None);564}565566fn token<'a>(token: Token<'a>, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {567Some(super::token(token, Location { line_number: line }))568}569570fn error<'a>(error: LexError, line: usize) -> Option<Result<LocatedToken<'a>, LocatedError>> {571Some(super::error(error, Location { line_number: line }))572}573574#[test]575fn make_lexer() {576let mut l1 = Lexer::new("");577let mut l2 = Lexer::new(" ");578let mut l3 = Lexer::new("\n ");579580assert_eq!(l1.next(), None);581assert_eq!(l2.next(), None);582assert_eq!(l3.next(), None);583}584585#[test]586fn lex_comment() {587let mut lex = Lexer::new("; hello");588assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));589assert_eq!(lex.next(), None);590591lex = Lexer::new("\n ;hello\n;foo");592assert_eq!(lex.next(), token(Token::Comment(";hello"), 2));593assert_eq!(lex.next(), token(Token::Comment(";foo"), 3));594assert_eq!(lex.next(), None);595596// Scan a comment after an invalid char.597let mut lex = Lexer::new("$; hello");598assert_eq!(lex.next(), error(LexError::InvalidChar, 1));599assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));600assert_eq!(lex.next(), None);601}602603#[test]604fn lex_chars() {605let mut lex = Lexer::new("(); hello\n = :{, }.");606assert_eq!(lex.next(), token(Token::LPar, 1));607assert_eq!(lex.next(), token(Token::RPar, 1));608assert_eq!(lex.next(), token(Token::Comment("; hello"), 1));609assert_eq!(lex.next(), token(Token::Equal, 2));610assert_eq!(lex.next(), token(Token::Colon, 2));611assert_eq!(lex.next(), token(Token::LBrace, 2));612assert_eq!(lex.next(), token(Token::Comma, 2));613assert_eq!(lex.next(), token(Token::RBrace, 2));614assert_eq!(lex.next(), token(Token::Dot, 2));615assert_eq!(lex.next(), None);616}617618#[test]619fn lex_numbers() {620let mut lex = Lexer::new(" 0 2_000 -1,0xf -0x0 0.0 0x0.4p-34 NaN +5");621assert_eq!(lex.next(), token(Token::Integer("0"), 1));622assert_eq!(lex.next(), token(Token::Integer("2_000"), 1));623assert_eq!(lex.next(), token(Token::Integer("-1"), 1));624assert_eq!(lex.next(), token(Token::Comma, 1));625assert_eq!(lex.next(), token(Token::Integer("0xf"), 1));626assert_eq!(lex.next(), token(Token::Integer("-0x0"), 1));627assert_eq!(lex.next(), token(Token::Float("0.0"), 1));628assert_eq!(lex.next(), token(Token::Float("0x0.4p-34"), 1));629assert_eq!(lex.next(), token(Token::Float("NaN"), 1));630assert_eq!(lex.next(), token(Token::Integer("+5"), 1));631assert_eq!(lex.next(), None);632}633634#[test]635fn lex_identifiers() {636let mut lex = Lexer::new(637"v0 v00 vx01 block1234567890 block5234567890 v1x vx1 vxvx4 \638function0 function i8 i32x4 f32x5 f16 f128",639);640assert_eq!(641lex.next(),642token(Token::Value(Value::with_number(0).unwrap()), 1)643);644assert_eq!(lex.next(), token(Token::Identifier("v00"), 1));645assert_eq!(lex.next(), token(Token::Identifier("vx01"), 1));646assert_eq!(647lex.next(),648token(Token::Block(Block::with_number(1234567890).unwrap()), 1)649);650assert_eq!(lex.next(), token(Token::Identifier("block5234567890"), 1));651assert_eq!(lex.next(), token(Token::Identifier("v1x"), 1));652assert_eq!(lex.next(), token(Token::Identifier("vx1"), 1));653assert_eq!(lex.next(), token(Token::Identifier("vxvx4"), 1));654assert_eq!(lex.next(), token(Token::Identifier("function0"), 1));655assert_eq!(lex.next(), token(Token::Identifier("function"), 1));656assert_eq!(lex.next(), token(Token::Type(types::I8), 1));657assert_eq!(lex.next(), token(Token::Type(types::I32X4), 1));658assert_eq!(lex.next(), token(Token::Identifier("f32x5"), 1));659assert_eq!(lex.next(), token(Token::Type(types::F16), 1));660assert_eq!(lex.next(), token(Token::Type(types::F128), 1));661assert_eq!(lex.next(), None);662}663664#[test]665fn lex_hex_sequences() {666let mut lex = Lexer::new("#0 #DEADbeef123 #789");667668assert_eq!(lex.next(), token(Token::HexSequence("0"), 1));669assert_eq!(lex.next(), token(Token::HexSequence("DEADbeef123"), 1));670assert_eq!(lex.next(), token(Token::HexSequence("789"), 1));671}672673#[test]674fn lex_names() {675let mut lex = Lexer::new("%0 %x3 %function %123_abc %ss0 %v3 %block11 %const42 %_");676677assert_eq!(lex.next(), token(Token::Name("0"), 1));678assert_eq!(lex.next(), token(Token::Name("x3"), 1));679assert_eq!(lex.next(), token(Token::Name("function"), 1));680assert_eq!(lex.next(), token(Token::Name("123_abc"), 1));681assert_eq!(lex.next(), token(Token::Name("ss0"), 1));682assert_eq!(lex.next(), token(Token::Name("v3"), 1));683assert_eq!(lex.next(), token(Token::Name("block11"), 1));684assert_eq!(lex.next(), token(Token::Name("const42"), 1));685assert_eq!(lex.next(), token(Token::Name("_"), 1));686}687688#[test]689fn lex_strings() {690let mut lex = Lexer::new(691r#""" "0" "x3""function" "123 abc" "\" "start692and end on693different lines" "#,694);695696assert_eq!(lex.next(), token(Token::String(""), 1));697assert_eq!(lex.next(), token(Token::String("0"), 1));698assert_eq!(lex.next(), token(Token::String("x3"), 1));699assert_eq!(lex.next(), token(Token::String("function"), 1));700assert_eq!(lex.next(), token(Token::String("123 abc"), 1));701assert_eq!(lex.next(), token(Token::String(r#"\"#), 1));702assert_eq!(703lex.next(),704token(705Token::String(706r#"start707and end on708different lines"#709),7101711)712);713}714715#[test]716fn lex_userrefs() {717let mut lex = Lexer::new("u0 u1 u234567890 u9:8765");718719assert_eq!(lex.next(), token(Token::UserRef(0), 1));720assert_eq!(lex.next(), token(Token::UserRef(1), 1));721assert_eq!(lex.next(), token(Token::UserRef(234567890), 1));722assert_eq!(lex.next(), token(Token::UserRef(9), 1));723assert_eq!(lex.next(), token(Token::Colon, 1));724assert_eq!(lex.next(), token(Token::Integer("8765"), 1));725assert_eq!(lex.next(), None);726}727}728729730