// Package scanner implements a lexical scanner for River source files.1package scanner23import (4"fmt"5"unicode"6"unicode/utf8"78"github.com/grafana/agent/pkg/river/token"9)1011// EBNF for the scanner:12//13// letter = /* any unicode letter class character */ | "_"14// number = /* any unicode number class character */15// digit = /* ASCII characters 0 through 9 */16// digits = digit { digit }17// string_character = /* any unicode character that isn't '"' */18//19// COMMENT = line_comment | block_comment20// line_comment = "//" { character }21// block_comment = "/*" { character | newline } "*/"22//23// IDENT = letter { letter | number }24// NULL = "null"25// BOOL = "true" | "false"26// NUMBER = digits27// FLOAT = ( digits | "." digits ) [ "e" [ "+" | "-" ] digits ]28// STRING = '"' { string_character | escape_sequence } '"'29// OR = "||"30// AND = "&&"31// NOT = "!"32// NEQ = "!="33// ASSIGN = "="34// EQ = "=="35// LT = "<"36// LTE = "<="37// GT = ">"38// GTE = ">="39// ADD = "+"40// SUB = "-"41// MUL = "*"42// DIV = "/"43// MOD = "%"44// POW = "^"45// LCURLY = "{"46// RCURLY = "}"47// LPAREN = "("48// RPAREN = ")"49// LBRACK = "["50// RBRACK = "]"51// COMMA = ","52// DOT = "."53//54// The EBNF for escape_sequence is currently undocumented; see scanEscape for55// details. The escape sequences supported by River are the same as the escape56// sequences supported by Go, except that it is always valid to use \' in57// strings (which in Go, is only valid to use in character literals).5859// ErrorHandler is invoked whenever there is an error.60type ErrorHandler func(pos token.Pos, msg string)6162// Mode is a set of bitwise flags which control scanner behavior.63type Mode uint6465const (66// IncludeComments will cause comments to be returned as comment tokens.67// Otherwise, comments are ignored.68IncludeComments Mode = 1 << iota6970// Avoids automatic insertion of terminators (for testing only).71dontInsertTerms72)7374const (75bom = 0xFEFF // byte order mark, permitted as very first character76eof = -1 // end of file77)7879// Scanner holds the internal state for the tokenizer while processing configs.80type Scanner struct {81file *token.File // Config file handle for tracking line offsets82input []byte // Input config83err ErrorHandler // Error reporting (may be nil)84mode Mode8586// scanning state variables:8788ch rune // Current character89offset int // Byte offset of ch90readOffset int // Byte offset of first character *after* ch91insertTerm bool // Insert a newline before the next newline92numErrors int // Number of errors encountered during scanning93}9495// New creates a new scanner to tokenize the provided input config. The scanner96// uses the provided file for adding line information for each token. The mode97// parameter customizes scanner behavior.98//99// Calls to Scan will invoke the error handler eh when a lexical error is found100// if eh is not nil.101func New(file *token.File, input []byte, eh ErrorHandler, mode Mode) *Scanner {102s := &Scanner{103file: file,104input: input,105err: eh,106mode: mode,107}108109// Preload first character.110s.next()111if s.ch == bom {112s.next() // Ignore BOM if it's the first character.113}114return s115}116117// peek gets the next byte after the current character without advancing the118// scanner. Returns 0 if the scanner is at EOF.119func (s *Scanner) peek() byte {120if s.readOffset < len(s.input) {121return s.input[s.readOffset]122}123return 0124}125126// next advances the scanner and reads the next Unicode character into s.ch.127// s.ch == eof indicates end of file.128func (s *Scanner) next() {129if s.readOffset >= len(s.input) {130s.offset = len(s.input)131if s.ch == '\n' {132// Make sure we track final newlines at the end of the file133s.file.AddLine(s.offset)134}135s.ch = eof136return137}138139s.offset = s.readOffset140if s.ch == '\n' {141s.file.AddLine(s.offset)142}143144r, width := rune(s.input[s.readOffset]), 1145switch {146case r == 0:147s.onError(s.offset, "illegal character NUL")148case r >= utf8.RuneSelf:149r, width = utf8.DecodeRune(s.input[s.readOffset:])150if r == utf8.RuneError && width == 1 {151s.onError(s.offset, "illegal UTF-8 encoding")152} else if r == bom && s.offset > 0 {153s.onError(s.offset, "illegal byte order mark")154}155}156s.readOffset += width157s.ch = r158}159160func (s *Scanner) onError(offset int, msg string) {161if s.err != nil {162s.err(s.file.Pos(offset), msg)163}164s.numErrors++165}166167// NumErrors returns the current number of errors encountered during scanning.168// This is useful as a fallback to detect errors when no ErrorHandler was169// provided to the scanner.170func (s *Scanner) NumErrors() int { return s.numErrors }171172// Scan scans the next token and returns the token's position, the token173// itself, and the token's literal string (when applicable). The end of the174// input is indicated by token.EOF.175//176// If the returned token is a literal (such as token.STRING), then lit contains177// the corresponding literal text (including surrounding quotes).178//179// If the returned token is a keyword, lit is the keyword text that was180// scanned.181//182// If the returned token is token.TERMINATOR, lit will contain "\n".183//184// If the returned token is token.ILLEGAL, lit contains the offending185// character.186//187// In all other cases, lit will be an empty string.188//189// For more tolerant parsing, Scan returns a valid token character whenever190// possible when a syntax error was encountered. Callers must check NumErrors191// or the number of times the provided ErrorHandler was invoked to ensure there192// were no errors found during scanning.193//194// Scan will inject line information to the file provided by NewScanner.195// Returned token positions are relative to that file.196func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {197scanAgain:198s.skipWhitespace()199200// Start of current token.201pos = s.file.Pos(s.offset)202203var insertTerm bool204205// Determine token value206switch ch := s.ch; {207case isLetter(ch):208lit = s.scanIdentifier()209if len(lit) > 1 { // Keywords are always > 1 char210tok = token.Lookup(lit)211switch tok {212case token.IDENT, token.NULL, token.BOOL:213insertTerm = true214}215} else {216insertTerm = true217tok = token.IDENT218}219220case isDecimal(ch) || (ch == '.' && isDecimal(rune(s.peek()))):221insertTerm = true222tok, lit = s.scanNumber()223224default:225s.next() // Make progress226227// ch is now the first character in a sequence and s.ch is the second228// character.229230switch ch {231case eof:232if s.insertTerm {233s.insertTerm = false // Consumed EOF234return pos, token.TERMINATOR, "\n"235}236tok = token.EOF237238case '\n':239// This case is only reachable when s.insertTerm is true, since otherwise240// skipWhitespace consumes all other newlines.241s.insertTerm = false // Consumed newline242return pos, token.TERMINATOR, "\n"243244case '\'':245s.onError(pos.Offset(), "illegal single-quoted string; use double quotes")246insertTerm = true247tok = token.ILLEGAL248lit = s.scanString('\'')249250case '"':251insertTerm = true252tok = token.STRING253lit = s.scanString('"')254255case '|':256if s.ch != '|' {257s.onError(s.offset, "missing second | in ||")258} else {259s.next() // consume second '|'260}261tok = token.OR262case '&':263if s.ch != '&' {264s.onError(s.offset, "missing second & in &&")265} else {266s.next() // consume second '&'267}268tok = token.AND269270case '!': // !, !=271tok = s.switch2(token.NOT, token.NEQ, '=')272case '=': // =, ==273tok = s.switch2(token.ASSIGN, token.EQ, '=')274case '<': // <, <=275tok = s.switch2(token.LT, token.LTE, '=')276case '>': // >, >=277tok = s.switch2(token.GT, token.GTE, '=')278case '+':279tok = token.ADD280case '-':281tok = token.SUB282case '*':283tok = token.MUL284case '/':285if s.ch == '/' || s.ch == '*' {286// //- or /*-style comment.287//288// If we're expected to inject a terminator, we can only do so if our289// comment goes to the end of the line. Otherwise, the terminator will290// have to be injected after the comment token.291if s.insertTerm && s.findLineEnd() {292// Reset position to the beginning of the comment.293s.ch = '/'294s.offset = pos.Offset()295s.readOffset = s.offset + 1296s.insertTerm = false // Consumed newline297return pos, token.TERMINATOR, "\n"298}299comment := s.scanComment()300if s.mode&IncludeComments == 0 {301// Skip over comment302s.insertTerm = false // Consumed newline303goto scanAgain304}305tok = token.COMMENT306lit = comment307} else {308tok = token.DIV309}310311case '%':312tok = token.MOD313case '^':314tok = token.POW315case '{':316tok = token.LCURLY317case '}':318insertTerm = true319tok = token.RCURLY320case '(':321tok = token.LPAREN322case ')':323insertTerm = true324tok = token.RPAREN325case '[':326tok = token.LBRACK327case ']':328insertTerm = true329tok = token.RBRACK330case ',':331tok = token.COMMA332case '.':333// NOTE: Fractions starting with '.' are handled by outer switch334tok = token.DOT335336default:337// s.next() reports invalid BOMs so we don't need to repeat the error.338if ch != bom {339s.onError(pos.Offset(), fmt.Sprintf("illegal character %#U", ch))340}341insertTerm = s.insertTerm // Preserve previous s.insertTerm state342tok = token.ILLEGAL343lit = string(ch)344}345}346347if s.mode&dontInsertTerms == 0 {348s.insertTerm = insertTerm349}350return351}352353func (s *Scanner) skipWhitespace() {354for s.ch == ' ' || s.ch == '\t' || s.ch == '\r' || (s.ch == '\n' && !s.insertTerm) {355s.next()356}357}358359func isLetter(ch rune) bool {360// We check for ASCII first as an optimization, and leave checking unicode361// (the slowest) to the very end.362return (lower(ch) >= 'a' && lower(ch) <= 'z') ||363ch == '_' ||364(ch >= utf8.RuneSelf && unicode.IsLetter(ch))365}366367func lower(ch rune) rune { return ('a' - 'A') | ch }368func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }369func isDigit(ch rune) bool {370return isDecimal(ch) || (ch >= utf8.RuneSelf && unicode.IsDigit(ch))371}372373// scanIdentifier reads the string of valid identifier characters starting at374// s.offet. It must only be called when s.ch is a valid character which starts375// an identifier.376//377// scanIdentifier is highly optimized for identifiers are modifications must be378// made carefully.379func (s *Scanner) scanIdentifier() string {380off := s.offset381382// Optimize for common case of ASCII identifiers.383//384// Ranging over s.input[s.readOffset:] avoids bounds checks and avoids385// conversions to runes.386//387// We'll fall back to the slower path if we find a non-ASCII character.388for readOffset, b := range s.input[s.readOffset:] {389if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || (b >= '0' && b <= '9') {390// Common case: ASCII character; don't assign a rune.391continue392}393s.readOffset += readOffset394if b > 0 && b < utf8.RuneSelf {395// Optimization: ASCII character that isn't a letter or number; we've396// reached the end of the identifier sequence and can terminate. We avoid397// the call to s.next() and the corresponding setup.398//399// This optimization only works because we know that s.ch (the current400// character when scanIdentifier was called) is never '\n' since '\n'401// cannot start an identifier.402s.ch = rune(b)403s.offset = s.readOffset404s.readOffset++405goto exit406}407408// The preceding character is valid for an identifier because409// scanIdentifier is only called when s.ch is a letter; calling s.next() at410// s.readOffset will reset the scanner state.411s.next()412for isLetter(s.ch) || isDigit(s.ch) {413s.next()414}415416// No more valid characters for the identifier; terminate.417goto exit418}419420s.offset = len(s.input)421s.readOffset = len(s.input)422s.ch = eof423424exit:425return string(s.input[off:s.offset])426}427428func (s *Scanner) scanNumber() (tok token.Token, lit string) {429tok = token.NUMBER430off := s.offset431432// Integer part of number433if s.ch != '.' {434s.digits()435}436437// Fractional part of number438if s.ch == '.' {439tok = token.FLOAT440441s.next()442s.digits()443}444445// Exponent446if lower(s.ch) == 'e' {447tok = token.FLOAT448449s.next()450if s.ch == '+' || s.ch == '-' {451s.next()452}453454if s.digits() == 0 {455s.onError(off, "exponent has no digits")456}457}458459return tok, string(s.input[off:s.offset])460}461462// digits scans a sequence of digits.463func (s *Scanner) digits() (count int) {464for isDecimal(s.ch) {465s.next()466count++467}468return469}470471func (s *Scanner) scanString(until rune) string {472// subtract 1 to account for the opening '"' which was already consumed by473// the scanner forcing progress.474off := s.offset - 1475476for {477ch := s.ch478if ch == '\n' || ch == eof {479s.onError(off, "string literal not terminated")480break481}482s.next()483if ch == until {484break485}486if ch == '\\' {487s.scanEscape()488}489}490491return string(s.input[off:s.offset])492}493494// scanEscape parses an escape sequence. In case of a syntax error, scanEscape495// stops at the offending character without consuming it.496func (s *Scanner) scanEscape() {497off := s.offset498499var (500n int501base, max uint32502)503504switch s.ch {505case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':506s.next()507return508case '0', '1', '2', '3', '4', '5', '6', '7':509n, base, max = 3, 8, 255510case 'x':511s.next()512n, base, max = 2, 16, 255513case 'u':514s.next()515n, base, max = 4, 16, unicode.MaxRune516case 'U':517s.next()518n, base, max = 8, 16, unicode.MaxRune519default:520msg := "unknown escape sequence"521if s.ch == eof {522msg = "escape sequence not terminated"523}524s.onError(off, msg)525return526}527528var x uint32529for n > 0 {530d := uint32(digitVal(s.ch))531if d >= base {532msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)533if s.ch == eof {534msg = "escape sequence not terminated"535}536s.onError(off, msg)537return538}539x = x*base + d540s.next()541n--542}543544if x > max || x >= 0xD800 && x < 0xE000 {545s.onError(off, "escape sequence is invalid Unicode code point")546}547}548549func digitVal(ch rune) int {550switch {551case ch >= '0' && ch <= '9':552return int(ch - '0')553case lower(ch) >= 'a' && lower(ch) <= 'f':554return int(lower(ch) - 'a' + 10)555}556return 16 // Larger than any legal digit val557}558559func (s *Scanner) scanComment() string {560// The initial character in the comment was already consumed from the scanner561// forcing progress.562//563// slashComment will be true when the comment is a //- or /*-style comment.564565var (566off = s.offset - 1 // Offset of initial character567numCR = 0568569blockComment = false570)571572if s.ch == '/' { // NOTE: s.ch is second character in comment sequence573// //-style comment.574//575// The final '\n' is not considered to be part of the comment.576if s.ch == '/' {577s.next() // Consume second '/'578}579580for s.ch != '\n' && s.ch != eof {581if s.ch == '\r' {582numCR++583}584s.next()585}586587goto exit588}589590// /*-style comment.591blockComment = true592s.next()593for s.ch != eof {594ch := s.ch595if ch == '\r' {596numCR++597}598s.next()599if ch == '*' && s.ch == '/' {600s.next()601goto exit602}603}604605s.onError(off, "block comment not terminated")606607exit:608lit := s.input[off:s.offset]609610// On Windows, a single comment line may end in "\r\n". We want to remove the611// final \r.612if numCR > 0 && len(lit) >= 1 && lit[len(lit)-1] == '\r' {613lit = lit[:len(lit)-1]614numCR--615}616617if numCR > 0 {618lit = stripCR(lit, blockComment)619}620621return string(lit)622}623624func stripCR(b []byte, blockComment bool) []byte {625c := make([]byte, len(b))626i := 0627628for j, ch := range b {629if ch != '\r' || blockComment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {630c[i] = ch631i++632}633}634635return c[:i]636}637638// findLineEnd checks to see if a comment runs to the end of the line.639func (s *Scanner) findLineEnd() bool {640// NOTE: initial '/' is already consumed by forcing the scanner to progress.641642defer func(off int) {643// Reset scanner state to where it was upon calling findLineEnd.644s.ch = '/'645s.offset = off646s.readOffset = off + 1647s.next() // Consume initial starting '/' again648}(s.offset - 1)649650// Read ahead until a newline, EOF, or non-comment token is found.651// We loop to consume multiple sequences of comment tokens.652for s.ch == '/' || s.ch == '*' {653if s.ch == '/' {654// //-style comments always contain newlines.655return true656}657658// We're looking at a /*-style comment; look for its newline.659s.next()660for s.ch != eof {661ch := s.ch662if ch == '\n' {663return true664}665s.next()666if ch == '*' && s.ch == '/' { // End of block comment667s.next()668break669}670}671672// Check to see if there's a newline after the block comment.673s.skipWhitespace() // s.insertTerm is set674if s.ch == eof || s.ch == '\n' {675return true676}677if s.ch != '/' {678// Non-comment token679return false680}681s.next() // Consume '/' at the end of the /* style-comment682}683684return false685}686687// switch2 returns a if s.ch is next, b otherwise. The scanner will be advanced688// if b is returned.689//690// This is used for tokens which can either be a single character but also are691// the starting character for a 2-length token (i.e., = and ==).692func (s *Scanner) switch2(a, b token.Token, next rune) token.Token { //nolint:unparam693if s.ch == next {694s.next()695return b696}697return a698}699700701