Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
aos
GitHub Repository: aos/grafana-agent
Path: blob/main/pkg/river/scanner/scanner.go
4096 views
1
// Package scanner implements a lexical scanner for River source files.
2
package scanner
3
4
import (
5
"fmt"
6
"unicode"
7
"unicode/utf8"
8
9
"github.com/grafana/agent/pkg/river/token"
10
)
11
12
// EBNF for the scanner:
13
//
14
// letter = /* any unicode letter class character */ | "_"
15
// number = /* any unicode number class character */
16
// digit = /* ASCII characters 0 through 9 */
17
// digits = digit { digit }
18
// string_character = /* any unicode character that isn't '"' */
19
//
20
// COMMENT = line_comment | block_comment
21
// line_comment = "//" { character }
22
// block_comment = "/*" { character | newline } "*/"
23
//
24
// IDENT = letter { letter | number }
25
// NULL = "null"
26
// BOOL = "true" | "false"
27
// NUMBER = digits
28
// FLOAT = digits exponent | ( digits "." { digit } | "." digits ) [ exponent ]
// exponent = ( "e" | "E" ) [ "+" | "-" ] digits
29
// STRING = '"' { string_character | escape_sequence } '"'
30
// OR = "||"
31
// AND = "&&"
32
// NOT = "!"
33
// NEQ = "!="
34
// ASSIGN = "="
35
// EQ = "=="
36
// LT = "<"
37
// LTE = "<="
38
// GT = ">"
39
// GTE = ">="
40
// ADD = "+"
41
// SUB = "-"
42
// MUL = "*"
43
// DIV = "/"
44
// MOD = "%"
45
// POW = "^"
46
// LCURLY = "{"
47
// RCURLY = "}"
48
// LPAREN = "("
49
// RPAREN = ")"
50
// LBRACK = "["
51
// RBRACK = "]"
52
// COMMA = ","
53
// DOT = "."
54
//
55
// The EBNF for escape_sequence is currently undocumented; see scanEscape for
56
// details. The escape sequences supported by River are the same as the escape
57
// sequences supported by Go, except that it is always valid to use \' in
58
// strings (which in Go, is only valid to use in character literals).
59
60
// ErrorHandler is the callback a Scanner invokes each time it reports a
// lexical error. pos is the position within the scanned file that the error
// is attributed to, and msg is a human-readable description of the problem.
type ErrorHandler func(pos token.Pos, msg string)
62
63
// Mode is a set of bitwise flags which control scanner behavior. A Mode value
// is passed to New and consulted by Scan.
type Mode uint
65
66
const (
	// IncludeComments will cause comments to be returned as comment tokens.
	// Otherwise, comments are ignored.
	IncludeComments Mode = 1 << iota

	// dontInsertTerms avoids automatic insertion of terminators (for testing
	// only). While it is set, Scan never updates the scanner's insertTerm
	// state.
	dontInsertTerms
)
74
75
// Special rune values recognized by the scanner.
const (
	bom = 0xFEFF // byte order mark, permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the input is exhausted
)
79
80
// Scanner holds the internal state for the tokenizer while processing configs.
// A Scanner is created with New and advanced one token at a time with Scan.
type Scanner struct {
	file  *token.File  // Config file handle for tracking line offsets
	input []byte       // Input config
	err   ErrorHandler // Error reporting (may be nil)
	mode  Mode         // Flags controlling scanner behavior

	// scanning state variables:

	ch         rune // Current character (eof once the input is exhausted)
	offset     int  // Byte offset of ch
	readOffset int  // Byte offset of first character *after* ch
	insertTerm bool // Insert a terminator before the next newline
	numErrors  int  // Number of errors encountered during scanning
}
95
96
// New creates a new scanner to tokenize the provided input config. The scanner
97
// uses the provided file for adding line information for each token. The mode
98
// parameter customizes scanner behavior.
99
//
100
// Calls to Scan will invoke the error handler eh when a lexical error is found
101
// if eh is not nil.
102
func New(file *token.File, input []byte, eh ErrorHandler, mode Mode) *Scanner {
103
s := &Scanner{
104
file: file,
105
input: input,
106
err: eh,
107
mode: mode,
108
}
109
110
// Preload first character.
111
s.next()
112
if s.ch == bom {
113
s.next() // Ignore BOM if it's the first character.
114
}
115
return s
116
}
117
118
// peek gets the next byte after the current character without advancing the
119
// scanner. Returns 0 if the scanner is at EOF.
120
func (s *Scanner) peek() byte {
121
if s.readOffset < len(s.input) {
122
return s.input[s.readOffset]
123
}
124
return 0
125
}
126
127
// next advances the scanner and reads the next Unicode character into s.ch.
128
// s.ch == eof indicates end of file.
129
func (s *Scanner) next() {
130
if s.readOffset >= len(s.input) {
131
s.offset = len(s.input)
132
if s.ch == '\n' {
133
// Make sure we track final newlines at the end of the file
134
s.file.AddLine(s.offset)
135
}
136
s.ch = eof
137
return
138
}
139
140
s.offset = s.readOffset
141
if s.ch == '\n' {
142
s.file.AddLine(s.offset)
143
}
144
145
r, width := rune(s.input[s.readOffset]), 1
146
switch {
147
case r == 0:
148
s.onError(s.offset, "illegal character NUL")
149
case r >= utf8.RuneSelf:
150
r, width = utf8.DecodeRune(s.input[s.readOffset:])
151
if r == utf8.RuneError && width == 1 {
152
s.onError(s.offset, "illegal UTF-8 encoding")
153
} else if r == bom && s.offset > 0 {
154
s.onError(s.offset, "illegal byte order mark")
155
}
156
}
157
s.readOffset += width
158
s.ch = r
159
}
160
161
// onError records a lexical error found at the given byte offset. The error
// handler, when one was provided, is invoked with the offset converted to a
// position in s.file; afterwards the error count is incremented.
func (s *Scanner) onError(offset int, msg string) {
	if handler := s.err; handler != nil {
		handler(s.file.Pos(offset), msg)
	}
	s.numErrors++
}
167
168
// NumErrors returns the current number of errors encountered during scanning.
169
// This is useful as a fallback to detect errors when no ErrorHandler was
170
// provided to the scanner.
171
func (s *Scanner) NumErrors() int { return s.numErrors }
172
173
// Scan scans the next token and returns the token's position, the token
// itself, and the token's literal string (when applicable). The end of the
// input is indicated by token.EOF.
//
// If the returned token is a literal (such as token.STRING), then lit contains
// the corresponding literal text (including surrounding quotes).
//
// If the returned token is an identifier or a keyword, lit is the text that
// was scanned.
//
// If the returned token is token.COMMENT, lit contains the comment text with
// carriage returns removed. Comments are only returned when the scanner was
// created with the IncludeComments mode.
//
// If the returned token is token.TERMINATOR, lit will contain "\n".
//
// If the returned token is token.ILLEGAL, lit contains the offending
// character, or the whole quoted text when the token is a single-quoted
// string.
//
// In all other cases, lit will be an empty string.
//
// For more tolerant parsing, Scan returns a valid token character whenever
// possible when a syntax error was encountered. Callers must check NumErrors
// or the number of times the provided ErrorHandler was invoked to ensure there
// were no errors found during scanning.
//
// Scan will inject line information to the file provided to New. Returned
// token positions are relative to that file.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// Start of current token.
	pos = s.file.Pos(s.offset)

	// insertTerm is the value s.insertTerm takes once this token has been
	// scanned: true when a newline following the token must be turned into a
	// terminator.
	var insertTerm bool

	// Determine token value
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 { // Keywords are always > 1 char
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.NULL, token.BOOL:
				insertTerm = true
			}
		} else {
			insertTerm = true
			tok = token.IDENT
		}

	case isDecimal(ch) || (ch == '.' && isDecimal(rune(s.peek()))):
		// A number starts with a digit, or with a '.' immediately followed by a
		// digit.
		insertTerm = true
		tok, lit = s.scanNumber()

	default:
		s.next() // Make progress

		// ch is now the first character in a sequence and s.ch is the second
		// character.

		switch ch {
		case eof:
			// A pending terminator is emitted before EOF is reported; the next
			// call to Scan then returns token.EOF.
			if s.insertTerm {
				s.insertTerm = false // Consumed EOF
				return pos, token.TERMINATOR, "\n"
			}
			tok = token.EOF

		case '\n':
			// This case is only reachable when s.insertTerm is true, since otherwise
			// skipWhitespace consumes all other newlines.
			s.insertTerm = false // Consumed newline
			return pos, token.TERMINATOR, "\n"

		case '\'':
			// The error is reported, but the string is still scanned in full so
			// that scanning resumes after its closing quote.
			s.onError(pos.Offset(), "illegal single-quoted string; use double quotes")
			insertTerm = true
			tok = token.ILLEGAL
			lit = s.scanString('\'')

		case '"':
			insertTerm = true
			tok = token.STRING
			lit = s.scanString('"')

		case '|':
			// A lone '|' is reported as an error but still yields token.OR.
			if s.ch != '|' {
				s.onError(s.offset, "missing second | in ||")
			} else {
				s.next() // consume second '|'
			}
			tok = token.OR
		case '&':
			// A lone '&' is reported as an error but still yields token.AND.
			if s.ch != '&' {
				s.onError(s.offset, "missing second & in &&")
			} else {
				s.next() // consume second '&'
			}
			tok = token.AND

		case '!': // !, !=
			tok = s.switch2(token.NOT, token.NEQ, '=')
		case '=': // =, ==
			tok = s.switch2(token.ASSIGN, token.EQ, '=')
		case '<': // <, <=
			tok = s.switch2(token.LT, token.LTE, '=')
		case '>': // >, >=
			tok = s.switch2(token.GT, token.GTE, '=')
		case '+':
			tok = token.ADD
		case '-':
			tok = token.SUB
		case '*':
			tok = token.MUL
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// //- or /*-style comment.
				//
				// If we're expected to inject a terminator, we can only do so if our
				// comment goes to the end of the line. Otherwise, the terminator will
				// have to be injected after the comment token.
				if s.insertTerm && s.findLineEnd() {
					// Reset position to the beginning of the comment so that it is
					// scanned again by the next call to Scan.
					s.ch = '/'
					s.offset = pos.Offset()
					s.readOffset = s.offset + 1
					s.insertTerm = false // Consumed newline
					return pos, token.TERMINATOR, "\n"
				}
				comment := s.scanComment()
				if s.mode&IncludeComments == 0 {
					// Skip over comment
					s.insertTerm = false // Consumed newline
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = token.DIV
			}

		case '%':
			tok = token.MOD
		case '^':
			tok = token.POW
		case '{':
			tok = token.LCURLY
		case '}':
			insertTerm = true
			tok = token.RCURLY
		case '(':
			tok = token.LPAREN
		case ')':
			insertTerm = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertTerm = true
			tok = token.RBRACK
		case ',':
			tok = token.COMMA
		case '.':
			// NOTE: Fractions starting with '.' are handled by outer switch
			tok = token.DOT

		default:
			// s.next() reports invalid BOMs so we don't need to repeat the error.
			if ch != bom {
				s.onError(pos.Offset(), fmt.Sprintf("illegal character %#U", ch))
			}
			insertTerm = s.insertTerm // Preserve previous s.insertTerm state
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}

	// Terminator insertion can be disabled entirely (for testing), in which case
	// s.insertTerm is never modified here.
	if s.mode&dontInsertTerms == 0 {
		s.insertTerm = insertTerm
	}
	return
}
353
354
// skipWhitespace advances the scanner past spaces, tabs, and carriage returns.
// Newlines are skipped too, unless s.insertTerm is set, in which case the
// scanner stops at the newline so that it can be turned into a terminator.
func (s *Scanner) skipWhitespace() {
	for {
		switch s.ch {
		case ' ', '\t', '\r':
			// Always insignificant.
		case '\n':
			if s.insertTerm {
				return
			}
		default:
			return
		}
		s.next()
	}
}
359
360
// isLetter reports whether ch may appear as a letter in an identifier: an
// ASCII letter, an underscore, or any non-ASCII character in the Unicode
// letter class.
func isLetter(ch rune) bool {
	// The cheap ASCII comparisons run first; the Unicode table lookup is only
	// reached for non-ASCII characters.
	switch {
	case 'a' <= ch && ch <= 'z', 'A' <= ch && ch <= 'Z', ch == '_':
		return true
	case ch >= utf8.RuneSelf:
		return unicode.IsLetter(ch)
	}
	return false
}
367
368
// lower sets the bit which distinguishes ASCII lowercase letters from
// uppercase letters, mapping 'A'-'Z' onto 'a'-'z'. The result is only
// meaningful as a letter when ch is an ASCII letter.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
369
// isDecimal reports whether ch is one of the ASCII digits 0 through 9.
func isDecimal(ch rune) bool {
	return ch >= '0' && ch <= '9'
}
370
// isDigit reports whether ch is an ASCII digit or a non-ASCII character in
// the Unicode decimal digit class.
func isDigit(ch rune) bool {
	if '0' <= ch && ch <= '9' {
		return true
	}
	if ch < utf8.RuneSelf {
		return false
	}
	return unicode.IsDigit(ch)
}
373
374
// scanIdentifier reads the string of valid identifier characters starting at
// s.offset and returns it. It must only be called when s.ch is a valid
// character which starts an identifier.
//
// On return, s.ch is the first character after the identifier (or eof when
// the identifier runs to the end of the input).
//
// scanIdentifier is highly optimized for identifiers and modifications must be
// made carefully.
func (s *Scanner) scanIdentifier() string {
	off := s.offset

	// Optimize for common case of ASCII identifiers.
	//
	// Ranging over s.input[s.readOffset:] avoids bounds checks and avoids
	// conversions to runes.
	//
	// We'll fall back to the slower path if we find a non-ASCII character.
	for readOffset, b := range s.input[s.readOffset:] {
		if (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' || (b >= '0' && b <= '9') {
			// Common case: ASCII character; don't assign a rune.
			continue
		}
		// readOffset is relative to the start of the ranged slice; make
		// s.readOffset point at the byte which stopped the fast path.
		s.readOffset += readOffset
		if b > 0 && b < utf8.RuneSelf {
			// Optimization: ASCII character that isn't a letter or number; we've
			// reached the end of the identifier sequence and can terminate. We avoid
			// the call to s.next() and the corresponding setup.
			//
			// This optimization only works because we know that s.ch (the current
			// character when scanIdentifier was called) is never '\n' since '\n'
			// cannot start an identifier.
			s.ch = rune(b)
			s.offset = s.readOffset
			s.readOffset++
			goto exit
		}

		// b is either NUL or the first byte of a non-ASCII character, both of
		// which need the full handling done by s.next().
		//
		// The preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter; calling s.next() at
		// s.readOffset will reset the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}

		// No more valid characters for the identifier; terminate.
		goto exit
	}

	// Every remaining byte of the input was part of the identifier, so the
	// scanner is now at the end of the file.
	s.offset = len(s.input)
	s.readOffset = len(s.input)
	s.ch = eof

exit:
	return string(s.input[off:s.offset])
}
428
429
// scanNumber scans a numeric literal starting at s.ch and returns its token
// kind along with the literal text. The result is token.NUMBER for a plain
// run of digits and token.FLOAT when the literal has a fractional part or an
// exponent. An exponent without any digits is reported as an error.
func (s *Scanner) scanNumber() (tok token.Token, lit string) {
	start := s.offset
	tok = token.NUMBER

	// Leading digits are absent when the literal begins with '.'.
	if s.ch != '.' {
		s.digits()
	}

	// Optional fraction.
	if s.ch == '.' {
		tok = token.FLOAT
		s.next()
		s.digits()
	}

	// Optional exponent, with either case of the letter 'e' and an optional
	// sign.
	if lower(s.ch) == 'e' {
		tok = token.FLOAT
		s.next()

		switch s.ch {
		case '+', '-':
			s.next()
		}

		if n := s.digits(); n == 0 {
			s.onError(start, "exponent has no digits")
		}
	}

	lit = string(s.input[start:s.offset])
	return tok, lit
}
462
463
// digits scans a sequence of digits.
464
func (s *Scanner) digits() (count int) {
465
for isDecimal(s.ch) {
466
s.next()
467
count++
468
}
469
return
470
}
471
472
// scanString scans the remainder of a string literal whose opening quote has
// already been consumed, and returns the literal text including its quotes.
// until is the quote character which closes the string.
//
// A newline or the end of the input before the closing quote is reported as
// an error; the text scanned up to that point is still returned.
func (s *Scanner) scanString(until rune) string {
	// The opening quote sits one byte before the current offset, since the
	// scanner was already moved past it.
	start := s.offset - 1

scan:
	for {
		cur := s.ch
		switch cur {
		case '\n', eof:
			s.onError(start, "string literal not terminated")
			break scan
		}

		s.next()
		switch cur {
		case until:
			break scan
		case '\\':
			s.scanEscape()
		}
	}

	return string(s.input[start:s.offset])
}
494
495
// scanEscape parses an escape sequence. In case of a syntax error, scanEscape
496
// stops at the offending character without consuming it.
497
func (s *Scanner) scanEscape() {
498
off := s.offset
499
500
var (
501
n int
502
base, max uint32
503
)
504
505
switch s.ch {
506
case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '"':
507
s.next()
508
return
509
case '0', '1', '2', '3', '4', '5', '6', '7':
510
n, base, max = 3, 8, 255
511
case 'x':
512
s.next()
513
n, base, max = 2, 16, 255
514
case 'u':
515
s.next()
516
n, base, max = 4, 16, unicode.MaxRune
517
case 'U':
518
s.next()
519
n, base, max = 8, 16, unicode.MaxRune
520
default:
521
msg := "unknown escape sequence"
522
if s.ch == eof {
523
msg = "escape sequence not terminated"
524
}
525
s.onError(off, msg)
526
return
527
}
528
529
var x uint32
530
for n > 0 {
531
d := uint32(digitVal(s.ch))
532
if d >= base {
533
msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
534
if s.ch == eof {
535
msg = "escape sequence not terminated"
536
}
537
s.onError(off, msg)
538
return
539
}
540
x = x*base + d
541
s.next()
542
n--
543
}
544
545
if x > max || x >= 0xD800 && x < 0xE000 {
546
s.onError(off, "escape sequence is invalid Unicode code point")
547
}
548
}
549
550
// digitVal returns the numeric value of ch interpreted as a hexadecimal digit
// (either case). Any other character yields 16, which is larger than every
// legal digit value.
func digitVal(ch rune) int {
	if '0' <= ch && ch <= '9' {
		return int(ch - '0')
	}
	if 'a' <= ch && ch <= 'f' {
		return int(ch-'a') + 10
	}
	if 'A' <= ch && ch <= 'F' {
		return int(ch-'A') + 10
	}
	return 16
}
559
560
func (s *Scanner) scanComment() string {
561
// The initial character in the comment was already consumed from the scanner
562
// forcing progress.
563
//
564
// slashComment will be true when the comment is a //- or /*-style comment.
565
566
var (
567
off = s.offset - 1 // Offset of initial character
568
numCR = 0
569
570
blockComment = false
571
)
572
573
if s.ch == '/' { // NOTE: s.ch is second character in comment sequence
574
// //-style comment.
575
//
576
// The final '\n' is not considered to be part of the comment.
577
if s.ch == '/' {
578
s.next() // Consume second '/'
579
}
580
581
for s.ch != '\n' && s.ch != eof {
582
if s.ch == '\r' {
583
numCR++
584
}
585
s.next()
586
}
587
588
goto exit
589
}
590
591
// /*-style comment.
592
blockComment = true
593
s.next()
594
for s.ch != eof {
595
ch := s.ch
596
if ch == '\r' {
597
numCR++
598
}
599
s.next()
600
if ch == '*' && s.ch == '/' {
601
s.next()
602
goto exit
603
}
604
}
605
606
s.onError(off, "block comment not terminated")
607
608
exit:
609
lit := s.input[off:s.offset]
610
611
// On Windows, a single comment line may end in "\r\n". We want to remove the
612
// final \r.
613
if numCR > 0 && len(lit) >= 1 && lit[len(lit)-1] == '\r' {
614
lit = lit[:len(lit)-1]
615
numCR--
616
}
617
618
if numCR > 0 {
619
lit = stripCR(lit, blockComment)
620
}
621
622
return string(lit)
623
}
624
625
// stripCR returns a copy of b with carriage returns removed. When
// blockComment is true, a carriage return sitting between a '*' and a '/'
// past the comment opener is kept, since removing it would join the two
// characters into a "*/" sequence.
func stripCR(b []byte, blockComment bool) []byte {
	out := make([]byte, 0, len(b))

	for idx, ch := range b {
		if ch == '\r' {
			splitsCloser := blockComment &&
				len(out) > len("/*") &&
				out[len(out)-1] == '*' &&
				idx+1 < len(b) &&
				b[idx+1] == '/'
			if !splitsCloser {
				continue
			}
		}
		out = append(out, ch)
	}

	return out
}
638
639
// findLineEnd checks to see if a comment runs to the end of the line. It must
// be called with s.ch positioned at the second character of a comment opener,
// the initial '/' having already been consumed.
//
// It returns true when the comment (or a sequence of comments on the same
// line) contains or is followed by a newline or EOF, and false when another
// token follows on the same line. findLineEnd only looks ahead: the scanner
// position is restored before it returns.
func (s *Scanner) findLineEnd() bool {
	// NOTE: initial '/' is already consumed by forcing the scanner to progress.

	defer func(off int) {
		// Reset scanner state to where it was upon calling findLineEnd.
		s.ch = '/'
		s.offset = off
		s.readOffset = off + 1
		s.next() // Consume initial starting '/' again
	}(s.offset - 1)

	// Read ahead until a newline, EOF, or non-comment token is found.
	// We loop to consume multiple sequences of comment tokens.
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			// //-style comments always run to the end of the line.
			return true
		}

		// We're looking at a /*-style comment; look for its newline.
		s.next()
		for s.ch != eof {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' { // End of block comment
				s.next()
				break
			}
		}

		// Check to see if there's a newline after the block comment.
		s.skipWhitespace() // s.insertTerm is set, so newlines are not skipped
		if s.ch == eof || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// Non-comment token
			return false
		}
		s.next() // Consume the '/' which may begin another comment
	}

	return false
}
687
688
// switch2 returns a if s.ch is next, b otherwise. The scanner will be advanced
689
// if b is returned.
690
//
691
// This is used for tokens which can either be a single character but also are
692
// the starting character for a 2-length token (i.e., = and ==).
693
func (s *Scanner) switch2(a, b token.Token, next rune) token.Token { //nolint:unparam
694
if s.ch == next {
695
s.next()
696
return b
697
}
698
return a
699
}
700
701