CoCalc -- scanner.go

GitHub Repository: aos/grafana-agent
Path: blob/main/pkg/river/scanner/scanner.go
⁴⁰⁹⁶ views
1
// Package scanner implements a lexical scanner for River source files.
2
package scanner
3

4
import (
5
	"fmt"
6
	"unicode"
7
	"unicode/utf8"
8

9
	"github.com/grafana/agent/pkg/river/token"
10
)
11

12
// EBNF for the scanner:
13
//
14
//   letter           = /* any unicode letter class character */ | "_"
15
//   number           = /* any unicode number class character */
16
//   digit            = /* ASCII characters 0 through 9 */
17
//   digits           = digit { digit }
18
//   string_character = /* any unicode character that isn't '"' */
19
//
20
//   COMMENT       = line_comment | block_comment
21
//   line_comment  = "//" { character }
22
//   block_comment = "/*" { character | newline } "*/"
23
//
24
//   IDENT   = letter { letter | number }
25
//   NULL    = "null"
26
//   BOOL    = "true" | "false"
27
//   NUMBER  = digits
28
//   FLOAT   = ( digits | "." digits ) [ "e" [ "+" | "-" ] digits ]
29
//   STRING  = '"' { string_character | escape_sequence } '"'
30
//   OR      = "||"
31
//   AND     = "&&"
32
//   NOT     = "!"
33
//   NEQ     = "!="
34
//   ASSIGN  = "="
35
//   EQ      = "=="
36
//   LT      = "<"
37
//   LTE     = "<="
38
//   GT      = ">"
39
//   GTE     = ">="
40
//   ADD     = "+"
41
//   SUB     = "-"
42
//   MUL     = "*"
43
//   DIV     = "/"
44
//   MOD     = "%"
45
//   POW     = "^"
46
//   LCURLY  = "{"
47
//   RCURLY  = "}"
48
//   LPAREN  = "("
49
//   RPAREN  = ")"
50
//   LBRACK  = "["
51
//   RBRACK  = "]"
52
//   COMMA   = ","
53
//   DOT     = "."
54
//
55
// The EBNF for escape_sequence is currently undocumented; see scanEscape for
56
// details. The escape sequences supported by River are the same as the escape
57
// sequences supported by Go, except that it is always valid to use \' in
58
// strings (which in Go, is only valid to use in character literals).
59

60
// ErrorHandler is invoked whenever there is an error.
61
type ErrorHandler func(pos token.Pos, msg string)
62

63
// Mode is a set of bitwise flags which control scanner behavior.
64
type Mode uint
65

66
const (
67
	// IncludeComments will cause comments to be returned as comment tokens.
68
	// Otherwise, comments are ignored.
69
	IncludeComments Mode = 1 << iota
70

71
	// Avoids automatic insertion of terminators (for testing only).
72
	dontInsertTerms
73
)
74

75
const (
76
	bom = 0xFEFF // byte order mark, permitted as very first character
77
	eof = -1     // end of file
78
)
79

80
// Scanner holds the internal state for the tokenizer while processing configs.
81
type Scanner struct {
82
	file  *token.File  // Config file handle for tracking line offsets
83
	input []byte       // Input config
84
	err   ErrorHandler // Error reporting (may be nil)
85
	mode  Mode
86

87
	// scanning state variables:
88

89
	ch         rune // Current character
90
	offset     int  // Byte offset of ch
91
	readOffset int  // Byte offset of first character *after* ch
92
	insertTerm bool // Insert a newline before the next newline
93
	numErrors  int  // Number of errors encountered during scanning
94
}
95

96
// New creates a new scanner to tokenize the provided input config. The scanner
97
// uses the provided file for adding line information for each token. The mode
98
// parameter customizes scanner behavior.
99
//
100
// Calls to Scan will invoke the error handler eh when a lexical error is found
101
// if eh is not nil.
102
func New(file *token.File, input []byte, eh ErrorHandler, mode Mode) *Scanner {
103
	s := &Scanner{
104
		file:  file,
105
		input: input,
106
		err:   eh,
107
		mode:  mode,
108
	}
109

110
	// Preload first character.
111
	s.next()
112
	if s.ch == bom {
113
		s.next() // Ignore BOM if it's the first character.
114
	}
115
	return s
116
}
117

118
// peek gets the next byte after the current character without advancing the
119
// scanner. Returns 0 if the scanner is at EOF.
120
func (s *Scanner) peek() byte {
121
	if s.readOffset < len(s.input) {
122
		return s.input[s.readOffset]
123
	}
124
	return 0
125
}
126

127
// next advances the scanner and reads the next Unicode character into s.ch.
128
// s.ch == eof indicates end of file.
129
func (s *Scanner) next() {
130
	if s.readOffset >= len(s.input) {
131
		s.offset = len(s.input)
132
		if s.ch == '\n' {
133
			// Make sure we track final newlines at the end of the file
134
			s.file.AddLine(s.offset)
135
		}
136
		s.ch = eof
137
		return
138
	}
139

140
	s.offset = s.readOffset
141
	if s.ch == '\n' {
142
		s.file.AddLine(s.offset)
143
	}
144

145
	r, width := rune(s.input[s.readOffset]), 1
146
	switch {
147
	case r == 0:
148
		s.onError(s.offset, "illegal character NUL")
149
	case r >= utf8.RuneSelf:
150
		r, width = utf8.DecodeRune(s.input[s.readOffset:])
151
		if r == utf8.RuneError && width == 1 {
152
			s.onError(s.offset, "illegal UTF-8 encoding")
153
		} else if r == bom && s.offset > 0 {
154
			s.onError(s.offset, "illegal byte order mark")
155
		}
156
	}
157
	s.readOffset += width
158
	s.ch = r
159
}
160

161
func (s *Scanner) onError(offset int, msg string) {
162
	if s.err != nil {
163
		s.err(s.file.Pos(offset), msg)
164
	}
165
	s.numErrors++
166
}
167

168
// NumErrors returns the current number of errors encountered during scanning.
169
// This is useful as a fallback to detect errors when no ErrorHandler was
170
// provided to the scanner.
171
func (s *Scanner) NumErrors() int { return s.numErrors }
172

173
// Scan scans the next token and returns the token's position, the token
174
// itself, and the token's literal string (when applicable). The end of the
175
// input is indicated by token.EOF.
176
//
177
// If the returned token is a literal (such as token.STRING), then lit contains
178
// the corresponding literal text (including surrounding quotes).
179
//
180
// If the returned token is a keyword, lit is the keyword text that was
181
// scanned.
182
//
183
// If the returned token is token.TERMINATOR, lit will contain "\n".
184
//
185
// If the returned token is token.ILLEGAL, lit contains the offending
186
// character.
187
//
188
// In all other cases, lit will be an empty string.
189
//
190
// For more tolerant parsing, Scan returns a valid token character whenever
191
// possible when a syntax error was encountered. Callers must check NumErrors
192
// or the number of times the provided ErrorHandler was invoked to ensure there
193
// were no errors found during scanning.
194
//
195
// Scan will inject line information to the file provided by NewScanner.
196
// Returned token positions are relative to that file.
197
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
198
scanAgain:
199
	s.skipWhitespace()
200

201
	// Start of current token.
202
	pos = s.file.Pos(s.offset)
203

204
	var insertTerm bool
205

206
	// Determine token value
207
	switch ch := s.ch; {
208
	case isLetter(ch):
209
		lit = s.scanIdentifier()
210
		if len(lit) > 1 { // Keywords are always > 1 char
211
			tok = token.Lookup(lit)
212
			switch tok {
213
			case token.IDENT, token.NULL, token.BOOL:
214
				insertTerm = true
215
			}
216
		} else {
217
			insertTerm = true
218
			tok = token.IDENT
219
		}
220

221
	case isDecimal(ch) || (ch == '.' && isDecimal(rune(s.peek()))):
222
		insertTerm = true
223
		tok, lit = s.scanNumber()
224

225
	default:
226
		s.next() // Make progress
227

228
		// ch is now the first character in a sequence and s.ch is the second
229
		// character.
230

231
		switch ch {
232
		case eof:
233
			if s.insertTerm {
234
				s.insertTerm = false // Consumed EOF
235
				return pos, token.TERMINATOR, "\n"
236
			}
237
			tok = token.EOF
238

239
		case '\n':
240
			// This case is only reachable when s.insertTerm is true, since otherwise
241
			// skipWhitespace consumes all other newlines.
242
			s.insertTerm = false // Consumed newline
243
			return pos, token.TERMINATOR, "\n"
244

245
		case '\'':
246
			s.onError(pos.Offset(), "illegal single-quoted string; use double quotes")
247
			insertTerm = true
248
			tok = token.ILLEGAL
249
			lit = s.scanString('\'')
250

251
		case '"':
252
			insertTerm = true
253
			tok = token.STRING
254
			lit = s.scanString('"')
255

256
		case '|':
257
			if s.ch != '|' {
258
				s.onError(s.offset, "missing second | in ||")
259
			} else {
260
				s.next() // consume second '|'
261
			}
262
			tok = token.OR
263
		case '&':
264
			if s.ch != '&' {
265
				s.onError(s.offset, "missing second & in &&")
266
			} else {
267
				s.next() // consume second '&'
268
			}
269
			tok = token.AND
270

271
		case '!': // !, !=
272
			tok = s.switch2(token.NOT, token.NEQ, '=')
273
		case '=': // =, ==
274
			tok = s.switch2(token.ASSIGN, token.EQ, '=')
275
		case '<': // <, <=
276
			tok = s.switch2(token.LT, token.LTE, '=')
277
		case '>': // >, >=
278
			tok = s.switch2(token.GT, token.GTE, '=')
279
		case '+':
280
			tok = token.ADD
281
		case '-':
282
			tok = token.SUB
283
		case '*':
284
			tok = token.MUL
285
		case '/':
286
			if s.ch == '/' || s.ch == '*' {
287
				// //- or /*-style comment.
288
				//
289
				// If we're expected to inject a terminator, we can only do so if our
290
				// comment goes to the end of the line. Otherwise, the terminator will
291
				// have to be injected after the comment token.
292
				if s.insertTerm && s.findLineEnd() {
293
					// Reset position to the beginning of the comment.
294
					s.ch = '/'
295
					s.offset = pos.Offset()
296
					s.readOffset = s.offset + 1
297
					s.insertTerm = false // Consumed newline
298
					return pos, token.TERMINATOR, "\n"
299
				}
300
				comment := s.scanComment()
301
				if s.mode&IncludeComments == 0 {
302
					// Skip over comment
303
					s.insertTerm = false // Consumed newline
304
					goto scanAgain
305
				}
306
				tok = token.COMMENT
307
				lit = comment
308
			} else {
309
				tok = token.DIV
310
			}
311

312
		case '%':
313
			tok = token.MOD
314
		case '^':
315
			tok = token.POW
316
		case '{':
317
			tok = token.LCURLY
318
		case '}':
319
			insertTerm = true
320
			tok = token.RCURLY
321
		case '(':
322
			tok = token.LPAREN
323
		case ')':
324
			insertTerm = true
325
			tok = token.RPAREN
326
		case '[':
327
			tok = token.LBRACK
328
		case ']':
329
			insertTerm = true
330
			tok = token.RBRACK
331
		case ',':
332
			tok = token.COMMA
333
		case '.':
334
			// NOTE: Fractions starting with '.' are handled by outer switch
335
			tok = token.DOT
336

337
		default:
338
			// s.next() reports invalid BOMs so we don't need to repeat the error.
339
			if ch != bom {
340
				s.onError(pos.Offset(), fmt.Sprintf("illegal character %#U", ch))
341
			}
342
			insertTerm = s.insertTerm // Preserve previous s.insertTerm state
343
			tok = token.ILLEGAL
344
			lit = string(ch)
345
		}
346
	}
347

348
	if s.mode&dontInsertTerms == 0 {
349
		s.insertTerm = insertTerm
350
	}
351
	return
352
}
353

354
func (s *Scanner) skipWhitespace() {
355
	for s.ch == ' ' || s.ch == '\t' || s.ch == '\r' || (s.ch == '\n' && !s.insertTerm) {
356
		s.next()
357
	}
358
}
359

360
func isLetter(ch rune) bool {
361
	// We check for ASCII first as an optimization, and leave checking unicode
362
	// (the slowest) to the very end.
363
	return (lower(ch) >= 'a' && lower(ch) <= 'z') ||
364
		ch == '_' ||
365
		(ch >= utf8.RuneSelf && unicode.IsLetter(ch))
366
}
367

368
func lower(ch rune) rune     { return ('a' - 'A') | ch }
369
func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
370
func isDigit(ch rune) bool {
371
	return isDecimal(ch) || (ch >= utf8.RuneSelf && unicode.IsDigit(ch))
372
}
373

374
// scanIdentifier reads the string of valid identifier characters starting at
375
// s.offet. It must only be called when s.ch is a valid character which starts
376
// an identifier.
377
//
378
// scanIdentifier is highly optimized for identifiers are modifications must be
379
// made carefully.
380
func (s *Scanner) scanIdentifier() string {
381
	off := s.offset
382

383
	// Optimize for common case of ASCII identifiers.
384
	//
385
	// Ranging over s.input[s.readOffset:] avoids bounds checks and avoids
386
	// conversions to runes.
387
	//
388
	// We'll fall back to the slower path if we find a non-ASCII character.
389
	for readOffset, b := range s.input[s.readOffset:] {
390
		if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || (b >= '0' && b <= '9') {
391
			// Common case: ASCII character; don't assign a rune.
392
			continue
393
		}
394
		s.readOffset += readOffset
395
		if b > 0 && b < utf8.RuneSelf {
396
			// Optimization: ASCII character that isn't a letter or number; we've
397
			// reached the end of the identifier sequence and can terminate. We avoid
398
			// the call to s.next() and the corresponding setup.
399
			//
400
			// This optimization only works because we know that s.ch (the current
401
			// character when scanIdentifier was called) is never '\n' since '\n'
402
			// cannot start an identifier.
403
			s.ch = rune(b)
404
			s.offset = s.readOffset
405
			s.readOffset++
406
			goto exit
407
		}
408

409
		// The preceding character is valid for an identifier because
410
		// scanIdentifier is only called when s.ch is a letter; calling s.next() at
411
		// s.readOffset will reset the scanner state.
412
		s.next()
413
		for isLetter(s.ch) || isDigit(s.ch) {
414
			s.next()
415
		}
416

417
		// No more valid characters for the identifier; terminate.
418
		goto exit
419
	}
420

421
	s.offset = len(s.input)
422
	s.readOffset = len(s.input)
423
	s.ch = eof
424

425
exit:
426
	return string(s.input[off:s.offset])
427
}
428

429
func (s *Scanner) scanNumber() (tok token.Token, lit string) {
430
	tok = token.NUMBER
431
	off := s.offset
432

433
	// Integer part of number
434
	if s.ch != '.' {
435
		s.digits()
436
	}
437

438
	// Fractional part of number
439
	if s.ch == '.' {
440
		tok = token.FLOAT
441

442
		s.next()
443
		s.digits()
444
	}
445

446
	// Exponent
447
	if lower(s.ch) == 'e' {
448
		tok = token.FLOAT
449

450
		s.next()
451
		if s.ch == '+' || s.ch == '-' {
452
			s.next()
453
		}
454

455
		if s.digits() == 0 {
456
			s.onError(off, "exponent has no digits")
457
		}
458
	}
459

460
	return tok, string(s.input[off:s.offset])
461
}
462

463
// digits scans a sequence of digits.
464
func (s *Scanner) digits() (count int) {
465
	for isDecimal(s.ch) {
466
		s.next()
467
		count++
468
	}
469
	return
470
}
471

472
func (s *Scanner) scanString(until rune) string {
473
	// subtract 1 to account for the opening '"' which was already consumed by
474
	// the scanner forcing progress.
475
	off := s.offset - 1
476

477
	for {
478
		ch := s.ch
479
		if ch == '\n' || ch == eof {
480
			s.onError(off, "string literal not terminated")
481
			break
482
		}
483
		s.next()
484
		if ch == until {
485
			break
486
		}
487
		if ch == '\\' {
488
			s.scanEscape()
489
		}
490
	}
491

492
	return string(s.input[off:s.offset])
493
}
494

495
// scanEscape parses an escape sequence. In case of a syntax error, scanEscape
496
// stops at the offending character without consuming it.
497
func (s *Scanner) scanEscape() {
498
	off := s.offset
499

500
	var (
501
		n         int
502
		base, max uint32
503
	)
504

505
	switch s.ch {
506
	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
507
		s.next()
508
		return
509
	case '0', '1', '2', '3', '4', '5', '6', '7':
510
		n, base, max = 3, 8, 255
511
	case 'x':
512
		s.next()
513
		n, base, max = 2, 16, 255
514
	case 'u':
515
		s.next()
516
		n, base, max = 4, 16, unicode.MaxRune
517
	case 'U':
518
		s.next()
519
		n, base, max = 8, 16, unicode.MaxRune
520
	default:
521
		msg := "unknown escape sequence"
522
		if s.ch == eof {
523
			msg = "escape sequence not terminated"
524
		}
525
		s.onError(off, msg)
526
		return
527
	}
528

529
	var x uint32
530
	for n > 0 {
531
		d := uint32(digitVal(s.ch))
532
		if d >= base {
533
			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
534
			if s.ch == eof {
535
				msg = "escape sequence not terminated"
536
			}
537
			s.onError(off, msg)
538
			return
539
		}
540
		x = x*base + d
541
		s.next()
542
		n--
543
	}
544

545
	if x > max || x >= 0xD800 && x < 0xE000 {
546
		s.onError(off, "escape sequence is invalid Unicode code point")
547
	}
548
}
549

550
func digitVal(ch rune) int {
551
	switch {
552
	case ch >= '0' && ch <= '9':
553
		return int(ch - '0')
554
	case lower(ch) >= 'a' && lower(ch) <= 'f':
555
		return int(lower(ch) - 'a' + 10)
556
	}
557
	return 16 // Larger than any legal digit val
558
}
559

560
func (s *Scanner) scanComment() string {
561
	// The initial character in the comment was already consumed from the scanner
562
	// forcing progress.
563
	//
564
	// slashComment will be true when the comment is a //- or /*-style comment.
565

566
	var (
567
		off   = s.offset - 1 // Offset of initial character
568
		numCR = 0
569

570
		blockComment = false
571
	)
572

573
	if s.ch == '/' { // NOTE: s.ch is second character in comment sequence
574
		// //-style comment.
575
		//
576
		// The final '\n' is not considered to be part of the comment.
577
		if s.ch == '/' {
578
			s.next() // Consume second '/'
579
		}
580

581
		for s.ch != '\n' && s.ch != eof {
582
			if s.ch == '\r' {
583
				numCR++
584
			}
585
			s.next()
586
		}
587

588
		goto exit
589
	}
590

591
	// /*-style comment.
592
	blockComment = true
593
	s.next()
594
	for s.ch != eof {
595
		ch := s.ch
596
		if ch == '\r' {
597
			numCR++
598
		}
599
		s.next()
600
		if ch == '*' && s.ch == '/' {
601
			s.next()
602
			goto exit
603
		}
604
	}
605

606
	s.onError(off, "block comment not terminated")
607

608
exit:
609
	lit := s.input[off:s.offset]
610

611
	// On Windows, a single comment line may end in "\r\n". We want to remove the
612
	// final \r.
613
	if numCR > 0 && len(lit) >= 1 && lit[len(lit)-1] == '\r' {
614
		lit = lit[:len(lit)-1]
615
		numCR--
616
	}
617

618
	if numCR > 0 {
619
		lit = stripCR(lit, blockComment)
620
	}
621

622
	return string(lit)
623
}
624

625
func stripCR(b []byte, blockComment bool) []byte {
626
	c := make([]byte, len(b))
627
	i := 0
628

629
	for j, ch := range b {
630
		if ch != '\r' || blockComment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
631
			c[i] = ch
632
			i++
633
		}
634
	}
635

636
	return c[:i]
637
}
638

639
// findLineEnd checks to see if a comment runs to the end of the line.
640
func (s *Scanner) findLineEnd() bool {
641
	// NOTE: initial '/' is already consumed by forcing the scanner to progress.
642

643
	defer func(off int) {
644
		// Reset scanner state to where it was upon calling findLineEnd.
645
		s.ch = '/'
646
		s.offset = off
647
		s.readOffset = off + 1
648
		s.next() // Consume initial starting '/' again
649
	}(s.offset - 1)
650

651
	// Read ahead until a newline, EOF, or non-comment token is found.
652
	// We loop to consume multiple sequences of comment tokens.
653
	for s.ch == '/' || s.ch == '*' {
654
		if s.ch == '/' {
655
			// //-style comments always contain newlines.
656
			return true
657
		}
658

659
		// We're looking at a /*-style comment; look for its newline.
660
		s.next()
661
		for s.ch != eof {
662
			ch := s.ch
663
			if ch == '\n' {
664
				return true
665
			}
666
			s.next()
667
			if ch == '*' && s.ch == '/' { // End of block comment
668
				s.next()
669
				break
670
			}
671
		}
672

673
		// Check to see if there's a newline after the block comment.
674
		s.skipWhitespace() // s.insertTerm is set
675
		if s.ch == eof || s.ch == '\n' {
676
			return true
677
		}
678
		if s.ch != '/' {
679
			// Non-comment token
680
			return false
681
		}
682
		s.next() // Consume '/' at the end of the /* style-comment
683
	}
684

685
	return false
686
}
687

688
// switch2 returns a if s.ch is next, b otherwise. The scanner will be advanced
689
// if b is returned.
690
//
691
// This is used for tokens which can either be a single character but also are
692
// the starting character for a 2-length token (i.e., = and ==).
693
func (s *Scanner) switch2(a, b token.Token, next rune) token.Token { //nolint:unparam
694
	if s.ch == next {
695
		s.next()
696
		return b
697
	}
698
	return a
699
}
700

701
Product

Resources

Company