// Source: GitHub repository Roblox/luau, path Ast/src/Lexer.cpp (blob/master)
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
2
#include "Luau/Lexer.h"
3
4
#include "Luau/Allocator.h"
5
#include "Luau/Common.h"
6
#include "Luau/Confusables.h"
7
#include "Luau/StringUtils.h"
8
9
#include <limits.h>
10
11
namespace Luau
12
{
13
14
// Constructs a payload-free lexeme: only the token type and its source location are recorded.
// `length` is zeroed and `data` is null.
Lexeme::Lexeme(const Location& location, Type type)
    : type(type)
    , location(location)
    , length(0)
    , data(nullptr)
{
}
21
22
// Constructs a single-character lexeme. The token type is the character's value reinterpreted as an
// unsigned byte, so the cast through `unsigned char` keeps bytes >= 0x80 from becoming negative.
Lexeme::Lexeme(const Location& location, char character)
    : type(static_cast<Type>(static_cast<unsigned char>(character)))
    , location(location)
    , length(0)
    , data(nullptr)
{
}
29
30
// Constructs a lexeme that refers to a span of characters (`data`, `size`).
// The pointer is stored as-is (no copy is made here); `size` is narrowed to `unsigned`.
// Only the string-like, number and comment token types may carry such a span.
Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t size)
    : type(type)
    , location(location)
    , length(unsigned(size))
    , data(data)
{
    LUAU_ASSERT(
        type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
    );
}
41
42
// Constructs a lexeme identified by a NUL-terminated name: an identifier, an attribute, or a reserved word.
// The pointer is stored as-is in `name`; `length` is left at zero.
Lexeme::Lexeme(const Location& location, Type type, const char* name)
    : type(type)
    , location(location)
    , length(0)
    , name(name)
{
    LUAU_ASSERT(type == Name || type == Attribute || (type >= Reserved_BEGIN && type < Lexeme::Reserved_END));
}
50
51
unsigned int Lexeme::getLength() const
52
{
53
LUAU_ASSERT(
54
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
55
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
56
);
57
58
return length;
59
}
60
61
// Spellings of the reserved words. The table is indexed with `type - Lexeme::Reserved_BEGIN`, so the
// order and count of entries must match the reserved-word range of Lexeme::Type exactly.
static const char* kReserved[] = {"and", "break", "do", "else", "elseif", "end", "false", "for", "function", "if", "in",
                                  "local", "nil", "not", "or", "repeat", "return", "then", "true", "until", "while"};
63
64
std::string Lexeme::toString() const
65
{
66
switch (type)
67
{
68
case Eof:
69
return "<eof>";
70
71
case Equal:
72
return "'=='";
73
74
case LessEqual:
75
return "'<='";
76
77
case GreaterEqual:
78
return "'>='";
79
80
case NotEqual:
81
return "'~='";
82
83
case Dot2:
84
return "'..'";
85
86
case Dot3:
87
return "'...'";
88
89
case SkinnyArrow:
90
return "'->'";
91
92
case DoubleColon:
93
return "'::'";
94
95
case FloorDiv:
96
return "'//'";
97
98
case AddAssign:
99
return "'+='";
100
101
case SubAssign:
102
return "'-='";
103
104
case MulAssign:
105
return "'*='";
106
107
case DivAssign:
108
return "'/='";
109
110
case FloorDivAssign:
111
return "'//='";
112
113
case ModAssign:
114
return "'%='";
115
116
case PowAssign:
117
return "'^='";
118
119
case ConcatAssign:
120
return "'..='";
121
122
case RawString:
123
case QuotedString:
124
return data ? format("\"%.*s\"", length, data) : "string";
125
126
case InterpStringBegin:
127
return data ? format("`%.*s{", length, data) : "the beginning of an interpolated string";
128
129
case InterpStringMid:
130
return data ? format("}%.*s{", length, data) : "the middle of an interpolated string";
131
132
case InterpStringEnd:
133
return data ? format("}%.*s`", length, data) : "the end of an interpolated string";
134
135
case InterpStringSimple:
136
return data ? format("`%.*s`", length, data) : "interpolated string";
137
138
case Number:
139
return data ? format("'%.*s'", length, data) : "number";
140
141
case Name:
142
return name ? format("'%s'", name) : "identifier";
143
144
case Comment:
145
return "comment";
146
147
case Attribute:
148
return name ? format("'%s'", name) : "attribute";
149
150
case AttributeOpen:
151
return "'@['";
152
153
case BrokenString:
154
return "malformed string";
155
156
case BrokenComment:
157
return "unfinished comment";
158
159
case BrokenInterpDoubleBrace:
160
return "'{{', which is invalid (did you mean '\\{'?)";
161
162
case BrokenUnicode:
163
if (codepoint)
164
{
165
if (const char* confusable = findConfusable(codepoint))
166
return format("Unicode character U+%x (did you mean '%s'?)", codepoint, confusable);
167
168
return format("Unicode character U+%x", codepoint);
169
}
170
else
171
{
172
return "invalid UTF-8 sequence";
173
}
174
175
default:
176
if (type < Char_END)
177
return format("'%c'", type);
178
else if (type >= Reserved_BEGIN && type < Reserved_END)
179
return format("'%s'", kReserved[type - Reserved_BEGIN]);
180
else
181
return "<unknown>";
182
}
183
}
184
185
bool AstNameTable::Entry::operator==(const Entry& other) const
186
{
187
return length == other.length && memcmp(value.value, other.value.value, length) == 0;
188
}
189
190
size_t AstNameTable::EntryHash::operator()(const Entry& e) const
191
{
192
// FNV1a
193
uint32_t hash = 2166136261;
194
195
for (size_t i = 0; i < e.length; ++i)
196
{
197
hash ^= uint8_t(e.value.value[i]);
198
hash *= 16777619;
199
}
200
201
return hash;
202
}
203
204
// Creates the name table and pre-registers every reserved word with its reserved token type.
AstNameTable::AstNameTable(Allocator& allocator)
    : data({AstName(""), 0, Lexeme::Eof}, 128)
    , allocator(allocator)
{
    // kReserved is indexed by (type - Reserved_BEGIN), so its size must match the enum range
    static_assert(sizeof(kReserved) / sizeof(kReserved[0]) == Lexeme::Reserved_END - Lexeme::Reserved_BEGIN);

    for (int i = Lexeme::Reserved_BEGIN; i < Lexeme::Reserved_END; ++i)
        addStatic(kReserved[i - Lexeme::Reserved_BEGIN], static_cast<Lexeme::Type>(i));
}
213
214
// Registers `name` with the given token type without copying it: the table stores the pointer that
// was passed in. Asserts that the name is not already present.
AstName AstNameTable::addStatic(const char* name, Lexeme::Type type)
{
    AstNameTable::Entry entry = {AstName(name), uint32_t(strlen(name)), type};

    LUAU_ASSERT(!data.contains(entry));
    data.insert(entry);

    return entry.value;
}
223
224
std::pair<AstName, Lexeme::Type> AstNameTable::getOrAddWithType(const char* name, size_t length)
225
{
226
AstNameTable::Entry key = {AstName(name), uint32_t(length), Lexeme::Eof};
227
const Entry& entry = data.insert(key);
228
229
// entry already was inserted
230
if (entry.type != Lexeme::Eof)
231
return std::make_pair(entry.value, entry.type);
232
233
// we just inserted an entry with a non-owned pointer into the map
234
// we need to correct it, *but* we need to be careful about not disturbing the hash value
235
char* nameData = static_cast<char*>(allocator.allocate(length + 1));
236
memcpy(nameData, name, length);
237
nameData[length] = 0;
238
239
const_cast<Entry&>(entry).value = AstName(nameData);
240
const_cast<Entry&>(entry).type = (name[0] == '@' ? Lexeme::Attribute : Lexeme::Name);
241
242
return std::make_pair(entry.value, entry.type);
243
}
244
245
std::pair<AstName, Lexeme::Type> AstNameTable::getWithType(const char* name, size_t length) const
246
{
247
if (const Entry* entry = data.find({AstName(name), uint32_t(length), Lexeme::Eof}))
248
{
249
return std::make_pair(entry->value, entry->type);
250
}
251
return std::make_pair(AstName(), Lexeme::Name);
252
}
253
254
// Interns the `len`-byte name starting at `name` and returns it, discarding the token type.
AstName AstNameTable::getOrAdd(const char* name, size_t len)
{
    return getOrAddWithType(name, len).first;
}
258
259
// Interns a NUL-terminated name and returns it, discarding the token type.
AstName AstNameTable::getOrAdd(const char* name)
{
    return getOrAddWithType(name, strlen(name)).first;
}
263
264
// Looks up a NUL-terminated name without inserting it; yields a default-constructed AstName when the
// name is not in the table.
AstName AstNameTable::get(const char* name) const
{
    return getWithType(name, strlen(name)).first;
}
268
269
// Returns true for ASCII letters a-z and A-Z only.
inline bool isAlpha(char ch)
{
    // use or trick to convert to lower case and unsigned comparison to do range check
    return unsigned((ch | ' ') - 'a') < 26;
}
274
275
// Returns true for ASCII decimal digits 0-9; the unsigned comparison rejects everything below '0' too.
inline bool isDigit(char ch)
{
    return unsigned(ch - '0') < 10;
}
279
280
// Returns true for ASCII hexadecimal digits: 0-9, a-f and A-F.
inline bool isHexDigit(char ch)
{
    // use or trick to convert to lower case and unsigned comparison to do range check
    return unsigned(ch - '0') < 10 || unsigned((ch | ' ') - 'a') < 6;
}
285
286
// Only '\n' counts as a newline for line tracking; '\r' does not.
inline bool isNewline(char ch)
{
    return ch == '\n';
}
290
291
// Maps the character that follows a backslash to the control character it denotes
// (a, b, f, n, r, t, v). Any other character is returned unchanged.
static char unescape(char ch)
{
    switch (ch)
    {
    case 'a':
        return '\a';
    case 'b':
        return '\b';
    case 'f':
        return '\f';
    case 'n':
        return '\n';
    case 'r':
        return '\r';
    case 't':
        return '\t';
    case 'v':
        return '\v';
    default:
        return ch;
    }
}
313
314
unsigned int Lexeme::getBlockDepth() const
315
{
316
LUAU_ASSERT(type == Lexeme::RawString || type == Lexeme::BlockComment);
317
318
// If we have a well-formed string, we are guaranteed to see 2 `]` characters after the end of the string contents
319
LUAU_ASSERT(*(data + length) == ']');
320
unsigned int depth = 0;
321
do
322
{
323
depth++;
324
} while (*(data + length + depth) != ']');
325
326
return depth - 1;
327
}
328
329
// Reports which quote character delimits a QuotedString lexeme by inspecting the character that
// immediately follows the string contents.
Lexeme::QuoteStyle Lexeme::getQuoteStyle() const
{
    LUAU_ASSERT(type == Lexeme::QuotedString);

    // If we have a well-formed string, we are guaranteed to see a closing delimiter after the string
    LUAU_ASSERT(data);

    char quote = *(data + length);
    if (quote == '\'')
        return Lexeme::QuoteStyle::Single;
    else if (quote == '"')
        return Lexeme::QuoteStyle::Double;

    LUAU_ASSERT(!"Unknown quote style");
    return Lexeme::QuoteStyle::Double; // unreachable, but required due to compiler warning
}
345
346
// Creates a lexer over `buffer[0..bufferSize)`. Reported positions start at `startPosition`.
// Comments are not skipped and names are interned by default.
Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Position startPosition)
    : buffer(buffer)
    , bufferSize(bufferSize)
    , offset(0)
    , line(startPosition.line)
    // position() computes the column as (offset - lineOffset); the unsigned wrap-around here makes
    // offset 0 map to startPosition.column
    , lineOffset(0u - startPosition.column)
    , lexeme((Location(Position(startPosition.line, startPosition.column), 0)), Lexeme::Eof)
    , names(names)
    , skipComments(false)
    , readNames(true)
{
}
358
359
void Lexer::setSkipComments(bool skip)
360
{
361
skipComments = skip;
362
}
363
364
void Lexer::setReadNames(bool read)
365
{
366
readNames = read;
367
}
368
369
// Advances to the next lexeme using the lexer's configured comment-skipping mode and records the
// previous lexeme's location.
const Lexeme& Lexer::next()
{
    return next(this->skipComments, true);
}
373
374
// Advances to the next lexeme and returns a reference to it.
// skipComments: when true, Comment and BlockComment lexemes are read and discarded.
// updatePrevLocation: when true, prevLocation is set to the location of the lexeme being replaced.
const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation)
{
    // in skipComments mode we reject valid comments
    do
    {
        // consume whitespace before the token
        while (isSpace(peekch()))
            consumeAny();

        if (updatePrevLocation)
            prevLocation = lexeme.location;

        lexeme = readNext();
        // only the first iteration may update prevLocation, so skipped comments never overwrite it
        updatePrevLocation = false;
    } while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment));

    return lexeme;
}
392
393
void Lexer::nextline()
394
{
395
while (peekch() != 0 && peekch() != '\r' && !isNewline(peekch()))
396
consume();
397
398
next();
399
}
400
401
// Returns the lexeme that next() would produce, then restores the lexer to the state it had before
// the call (offset, line tracking, current lexeme, previous location and brace stack).
Lexeme Lexer::lookahead()
{
    unsigned int currentOffset = offset;
    unsigned int currentLine = line;
    unsigned int currentLineOffset = lineOffset;
    Lexeme currentLexeme = lexeme;
    Location currentPrevLocation = prevLocation;
    size_t currentBraceStackSize = braceStack.size();
    BraceType currentBraceType = braceStack.empty() ? BraceType::Normal : braceStack.back();

    Lexeme result = next();

    offset = currentOffset;
    line = currentLine;
    lineOffset = currentLineOffset;
    lexeme = currentLexeme;
    prevLocation = currentPrevLocation;

    // undo the brace stack change made while reading the lexeme: re-push the saved top if an entry
    // was popped, or drop the entry that was pushed
    if (braceStack.size() < currentBraceStackSize)
        braceStack.push_back(currentBraceType);
    else if (braceStack.size() > currentBraceStackSize)
        braceStack.pop_back();

    return result;
}
426
427
bool Lexer::isReserved(const std::string& word)
428
{
429
for (int i = Lexeme::Reserved_BEGIN; i < Lexeme::Reserved_END; ++i)
430
if (word == kReserved[i - Lexeme::Reserved_BEGIN])
431
return true;
432
433
return false;
434
}
435
436
// Returns the character at the current offset, or 0 when the offset is at or past the end of the buffer.
LUAU_FORCEINLINE
char Lexer::peekch() const
{
    return (offset < bufferSize) ? buffer[offset] : 0;
}
441
442
// Returns the character `lookahead` positions past the current offset, or 0 when that position is at
// or past the end of the buffer.
LUAU_FORCEINLINE
char Lexer::peekch(unsigned int lookahead) const
{
    return (offset + lookahead < bufferSize) ? buffer[offset + lookahead] : 0;
}
447
448
// Returns the current position: the tracked line, with the column computed as the distance from
// lineOffset to the current offset.
LUAU_FORCEINLINE
Position Lexer::position() const
{
    return Position(line, offset - lineOffset);
}
453
454
// Advances past the current character without touching line tracking.
LUAU_FORCEINLINE
void Lexer::consume()
{
    // consume() assumes current character is known to not be a newline; use consumeAny if this is not guaranteed
    LUAU_ASSERT(!isNewline(buffer[offset]));

    offset++;
}
462
463
// Advances past the current character; when it is a newline, the line counter is incremented and
// lineOffset is moved to the first character of the following line.
LUAU_FORCEINLINE
void Lexer::consumeAny()
{
    if (isNewline(buffer[offset]))
    {
        line++;
        lineOffset = offset + 1;
    }

    offset++;
}
474
475
// Reads a comment starting at "--".
// If the dashes are followed by a valid long-bracket opener the result comes from readLongString
// (BlockComment, or BrokenComment if unterminated). Otherwise a Comment lexeme is produced whose text
// runs from just after "--" up to, but not including, the next '\r', '\n' or end of input.
Lexeme Lexer::readCommentBody()
{
    Position start = position();

    LUAU_ASSERT(peekch(0) == '-' && peekch(1) == '-');
    consume();
    consume();

    // captured before probing for a long bracket, so characters consumed by a failed probe remain
    // part of the single-line comment text
    size_t startOffset = offset;

    if (peekch() == '[')
    {
        int sep = skipLongSeparator();

        if (sep >= 0)
        {
            return readLongString(start, sep, Lexeme::BlockComment, Lexeme::BrokenComment);
        }
    }

    // fall back to single-line comment
    while (peekch() != 0 && peekch() != '\r' && !isNewline(peekch()))
        consume();

    return Lexeme(Location(start, position()), Lexeme::Comment, &buffer[startOffset], offset - startOffset);
}
501
502
// Given a sequence [===[ or ]===], returns:
503
// 1. number of equal signs (or 0 if none present) between the brackets
504
// 2. -1 if this is not a long comment/string separator
505
// 3. -N if this is a malformed separator
506
// Does *not* consume the closing brace.
507
int Lexer::skipLongSeparator()
508
{
509
char start = peekch();
510
511
LUAU_ASSERT(start == '[' || start == ']');
512
consume();
513
514
int count = 0;
515
516
while (peekch() == '=')
517
{
518
consume();
519
count++;
520
}
521
522
return (start == peekch()) ? count : (-count) - 1;
523
}
524
525
// Reads the body of a long-bracket string or comment whose opener had `sep` equal signs.
// On entry the lexer is positioned at the second '[' of the opener.
// Returns a lexeme of type `ok` spanning the contents between the brackets, or a payload-free lexeme
// of type `broken` when the input ends before a matching closer is found.
Lexeme Lexer::readLongString(const Position& start, int sep, Lexeme::Type ok, Lexeme::Type broken)
{
    // skip (second) [
    LUAU_ASSERT(peekch() == '[');
    consume();

    unsigned int startOffset = offset;

    while (peekch())
    {
        if (peekch() == ']')
        {
            if (skipLongSeparator() == sep)
            {
                LUAU_ASSERT(peekch() == ']');
                consume(); // skip (second) ]

                // step back over the closer: two brackets plus `sep` equal signs
                unsigned int endOffset = offset - sep - 2;
                LUAU_ASSERT(endOffset >= startOffset);

                return Lexeme(Location(start, position()), ok, &buffer[startOffset], endOffset - startOffset);
            }
        }
        else
        {
            consumeAny();
        }
    }

    return Lexeme(Location(start, position()), broken);
}
556
557
void Lexer::readBackslashInString()
558
{
559
LUAU_ASSERT(peekch() == '\\');
560
consume();
561
switch (peekch())
562
{
563
case '\r':
564
consume();
565
if (peekch() == '\n')
566
consumeAny();
567
break;
568
569
case 0:
570
break;
571
572
case 'z':
573
consume();
574
while (isSpace(peekch()))
575
consumeAny();
576
break;
577
578
default:
579
consumeAny();
580
}
581
}
582
583
// Reads a string delimited by single or double quotes.
// Returns a QuotedString lexeme whose span is the raw text between the quotes (escapes are not
// decoded here), or BrokenString when end of input, '\r' or '\n' appears before the closing quote.
Lexeme Lexer::readQuotedString()
{
    Position start = position();

    char delimiter = peekch();
    LUAU_ASSERT(delimiter == '\'' || delimiter == '"');
    consume();

    unsigned int startOffset = offset;

    while (peekch() != delimiter)
    {
        switch (peekch())
        {
        case 0:
        case '\r':
        case '\n':
            return Lexeme(Location(start, position()), Lexeme::BrokenString);

        case '\\':
            readBackslashInString();
            break;

        default:
            consume();
        }
    }

    // closing delimiter
    consume();

    // the span excludes the closing delimiter that was just consumed
    return Lexeme(Location(start, position()), Lexeme::QuotedString, &buffer[startOffset], offset - startOffset - 1);
}
615
616
// Reads the first section of a backtick-delimited interpolated string: InterpStringBegin when the
// section ends at '{', InterpStringSimple when it ends at the closing backtick.
Lexeme Lexer::readInterpolatedStringBegin()
{
    LUAU_ASSERT(peekch() == '`');

    Position start = position();
    consume();

    return readInterpolatedStringSection(start, Lexeme::InterpStringBegin, Lexeme::InterpStringSimple);
}
625
626
// Reads one section of an interpolated string, starting at the current offset.
// Returns:
// - `formatType` when the section ends at a single '{' (an interpolated expression follows)
// - `endType` when the section ends at the closing backtick
// - BrokenInterpDoubleBrace when "{{" is encountered
// - BrokenString when end of input, '\r' or '\n' is reached first
// In every payload-carrying case the span is the raw section text without the terminating character(s).
Lexeme Lexer::readInterpolatedStringSection(Position start, Lexeme::Type formatType, Lexeme::Type endType)
{
    unsigned int startOffset = offset;

    while (peekch() != '`')
    {
        switch (peekch())
        {
        case 0:
        case '\r':
        case '\n':
            return Lexeme(Location(start, position()), Lexeme::BrokenString);

        case '\\':
            // Allow for \u{}, which would otherwise be consumed by looking for {
            if (peekch(1) == 'u' && peekch(2) == '{')
            {
                consume(); // backslash
                consume(); // u
                consume(); // {
                break;
            }

            readBackslashInString();
            break;

        case '{':
        {
            // remember that the matching '}' resumes the interpolated string
            braceStack.push_back(BraceType::InterpolatedString);

            if (peekch(1) == '{')
            {
                // the lexeme is built before consuming the braces so its location and span end at the first '{'
                Lexeme brokenDoubleBrace =
                    Lexeme(Location(start, position()), Lexeme::BrokenInterpDoubleBrace, &buffer[startOffset], offset - startOffset);
                consume();
                consume();
                return brokenDoubleBrace;
            }

            consume();
            return Lexeme(Location(start, position()), formatType, &buffer[startOffset], offset - startOffset - 1);
        }

        default:
            consume();
        }
    }

    // closing backtick
    consume();

    return Lexeme(Location(start, position()), endType, &buffer[startOffset], offset - startOffset - 1);
}
678
679
// Skips a number-like character sequence and returns it as a Number lexeme spanning
// buffer[startOffset .. offset). `startOffset` may precede the current offset (a leading '.' that the
// caller already consumed).
Lexeme Lexer::readNumber(const Position& start, unsigned int startOffset)
{
    LUAU_ASSERT(isDigit(peekch()));

    // This function does not do the number parsing - it only skips a number-like pattern.
    // It uses the same logic as Lua stock lexer; the resulting string is later converted
    // to a number with proper verification.
    do
    {
        consume();
    } while (isDigit(peekch()) || peekch() == '.' || peekch() == '_');

    // optional exponent marker with an optional sign
    if (peekch() == 'e' || peekch() == 'E')
    {
        consume();

        if (peekch() == '+' || peekch() == '-')
            consume();
    }

    // any trailing letters, digits and underscores are swallowed into the same lexeme
    while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_')
        consume();

    return Lexeme(Location(start, position()), Lexeme::Number, &buffer[startOffset], offset - startOffset);
}
704
705
std::pair<AstName, Lexeme::Type> Lexer::readName()
706
{
707
LUAU_ASSERT(isAlpha(peekch()) || peekch() == '_' || peekch() == '@');
708
709
unsigned int startOffset = offset;
710
711
do
712
consume();
713
while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_');
714
715
return readNames ? names.getOrAddWithType(&buffer[startOffset], offset - startOffset)
716
: names.getWithType(&buffer[startOffset], offset - startOffset);
717
}
718
719
// Reads a single lexeme starting at the current offset and returns it.
// Leading whitespace is not skipped here; next() does that before calling this function.
Lexeme Lexer::readNext()
{
    Position start = position();

    switch (peekch())
    {
    case 0:
        return Lexeme(Location(start, 0), Lexeme::Eof);

    case '-':
    {
        if (peekch(1) == '>')
        {
            consume();
            consume();
            return Lexeme(Location(start, 2), Lexeme::SkinnyArrow);
        }
        else if (peekch(1) == '=')
        {
            consume();
            consume();
            return Lexeme(Location(start, 2), Lexeme::SubAssign);
        }
        else if (peekch(1) == '-')
        {
            return readCommentBody();
        }
        else
        {
            consume();
            return Lexeme(Location(start, 1), '-');
        }
    }

    case '[':
    {
        int sep = skipLongSeparator();

        if (sep >= 0)
        {
            return readLongString(start, sep, Lexeme::RawString, Lexeme::BrokenString);
        }
        else if (sep == -1)
        {
            // a lone '[' (already consumed by skipLongSeparator)
            return Lexeme(Location(start, 1), '[');
        }
        else
        {
            // '[' followed by '=' signs but no second '['
            return Lexeme(Location(start, position()), Lexeme::BrokenString);
        }
    }

    case '{':
    {
        consume();

        // braces are only tracked while inside an interpolated string expression
        if (!braceStack.empty())
            braceStack.push_back(BraceType::Normal);

        return Lexeme(Location(start, 1), '{');
    }

    case '}':
    {
        consume();

        if (braceStack.empty())
        {
            return Lexeme(Location(start, 1), '}');
        }

        const BraceType braceStackTop = braceStack.back();
        braceStack.pop_back();

        if (braceStackTop != BraceType::InterpolatedString)
        {
            return Lexeme(Location(start, 1), '}');
        }

        // this '}' closes an interpolated expression: continue reading the string
        return readInterpolatedStringSection(start, Lexeme::InterpStringMid, Lexeme::InterpStringEnd);
    }

    case '=':
    {
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::Equal);
        }
        else
            return Lexeme(Location(start, 1), '=');
    }

    case '<':
    {
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::LessEqual);
        }
        else
            return Lexeme(Location(start, 1), '<');
    }

    case '>':
    {
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::GreaterEqual);
        }
        else
            return Lexeme(Location(start, 1), '>');
    }

    case '~':
    {
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::NotEqual);
        }
        else
            return Lexeme(Location(start, 1), '~');
    }

    case '"':
    case '\'':
        return readQuotedString();

    case '`':
        return readInterpolatedStringBegin();

    case '.':
        consume();

        if (peekch() == '.')
        {
            consume();

            if (peekch() == '.')
            {
                consume();

                return Lexeme(Location(start, 3), Lexeme::Dot3);
            }
            else if (peekch() == '=')
            {
                consume();

                return Lexeme(Location(start, 3), Lexeme::ConcatAssign);
            }
            else
                return Lexeme(Location(start, 2), Lexeme::Dot2);
        }
        else
        {
            if (isDigit(peekch()))
            {
                // number with a leading '.'; include the already-consumed dot in the lexeme
                return readNumber(start, offset - 1);
            }
            else
                return Lexeme(Location(start, 1), '.');
        }

    case '+':
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::AddAssign);
        }
        else
            return Lexeme(Location(start, 1), '+');

    case '/':
    {
        consume();

        char ch = peekch();

        if (ch == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::DivAssign);
        }
        else if (ch == '/')
        {
            consume();

            if (peekch() == '=')
            {
                consume();
                return Lexeme(Location(start, 3), Lexeme::FloorDivAssign);
            }
            else
                return Lexeme(Location(start, 2), Lexeme::FloorDiv);
        }
        else
            return Lexeme(Location(start, 1), '/');
    }

    case '*':
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::MulAssign);
        }
        else
            return Lexeme(Location(start, 1), '*');

    case '%':
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::ModAssign);
        }
        else
            return Lexeme(Location(start, 1), '%');

    case '^':
        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::PowAssign);
        }
        else
            return Lexeme(Location(start, 1), '^');

    case ':':
    {
        consume();
        if (peekch() == ':')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::DoubleColon);
        }
        else
            return Lexeme(Location(start, 1), ':');
    }

    case '(':
    case ')':
    case ']':
    case ';':
    case ',':
    case '#':
    case '?':
    case '&':
    case '|':
    {
        char ch = peekch();
        consume();

        return Lexeme(Location(start, 1), ch);
    }
    case '@':
    {
        if (peekch(1) == '[')
        {
            consume();
            consume();

            return Lexeme(Location(start, 2), Lexeme::AttributeOpen);
        }
        else
        {
            // consume @ first
            consume();

            if (isAlpha(peekch()) || peekch() == '_')
            {
                std::pair<AstName, Lexeme::Type> attribute = readName();
                return Lexeme(Location(start, position()), Lexeme::Attribute, attribute.first.value);
            }
            else
            {
                // '@' with no name after it: an attribute with an empty name
                return Lexeme(Location(start, position()), Lexeme::Attribute, "");
            }
        }
    }
    default:
        if (isDigit(peekch()))
        {
            return readNumber(start, offset);
        }
        else if (isAlpha(peekch()) || peekch() == '_')
        {
            std::pair<AstName, Lexeme::Type> name = readName();

            return Lexeme(Location(start, position()), name.second, name.first.value);
        }
        else if (peekch() & 0x80)
        {
            // non-ASCII byte
            return readUtf8Error();
        }
        else
        {
            char ch = peekch();
            consume();

            return Lexeme(Location(start, 1), ch);
        }
    }
}
1039
1040
std::optional<Lexer::BraceType> Lexer::peekBraceStackTop()
1041
{
1042
if (braceStack.empty())
1043
return std::nullopt;
1044
else
1045
return {braceStack.back()};
1046
}
1047
1048
// Consumes one UTF-8 encoded character and returns a BrokenUnicode lexeme for it.
// When the sequence has a valid lead byte and the expected number of continuation bytes, the decoded
// code point is stored in the lexeme; otherwise the lexeme is returned without a code point and
// covers only the bytes consumed so far.
LUAU_NOINLINE Lexeme Lexer::readUtf8Error()
{
    Position start = position();
    uint32_t codepoint = 0;
    int size = 0;

    // determine the sequence length and the payload bits of the lead byte
    if ((peekch() & 0b10000000) == 0b00000000)
    {
        size = 1;
        codepoint = peekch() & 0x7F;
    }
    else if ((peekch() & 0b11100000) == 0b11000000)
    {
        size = 2;
        codepoint = peekch() & 0b11111;
    }
    else if ((peekch() & 0b11110000) == 0b11100000)
    {
        size = 3;
        codepoint = peekch() & 0b1111;
    }
    else if ((peekch() & 0b11111000) == 0b11110000)
    {
        size = 4;
        codepoint = peekch() & 0b111;
    }
    else
    {
        // not a valid lead byte
        consume();
        return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);
    }

    consume();

    for (int i = 1; i < size; ++i)
    {
        // every continuation byte must look like 10xxxxxx
        if ((peekch() & 0b11000000) != 0b10000000)
            return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);

        codepoint = codepoint << 6;
        codepoint |= (peekch() & 0b00111111);
        consume();
    }

    Lexeme result(Location(start, position()), Lexeme::BrokenUnicode);
    result.codepoint = codepoint;
    return result;
}
1096
1097
// Encodes `code` as UTF-8 into `data` and returns the number of bytes written (1 to 4).
// Returns 0 and writes nothing when `code` is above U+10FFFF.
// `data` must have room for the encoded bytes (up to 4).
static size_t toUtf8(char* data, unsigned int code)
{
    // U+0000..U+007F
    if (code < 0x80)
    {
        data[0] = char(code);
        return 1;
    }
    // U+0080..U+07FF
    else if (code < 0x800)
    {
        data[0] = char(0xC0 | (code >> 6));
        data[1] = char(0x80 | (code & 0x3F));
        return 2;
    }
    // U+0800..U+FFFF
    else if (code < 0x10000)
    {
        data[0] = char(0xE0 | (code >> 12));
        data[1] = char(0x80 | ((code >> 6) & 0x3F));
        data[2] = char(0x80 | (code & 0x3F));
        return 3;
    }
    // U+10000..U+10FFFF
    else if (code < 0x110000)
    {
        data[0] = char(0xF0 | (code >> 18));
        data[1] = char(0x80 | ((code >> 12) & 0x3F));
        data[2] = char(0x80 | ((code >> 6) & 0x3F));
        data[3] = char(0x80 | (code & 0x3F));
        return 4;
    }
    else
    {
        return 0;
    }
}
1134
1135
bool Lexer::fixupQuotedString(std::string& data)
1136
{
1137
if (data.empty() || data.find('\\') == std::string::npos)
1138
return true;
1139
1140
size_t size = data.size();
1141
size_t write = 0;
1142
1143
for (size_t i = 0; i < size;)
1144
{
1145
if (data[i] != '\\')
1146
{
1147
data[write++] = data[i];
1148
i++;
1149
continue;
1150
}
1151
1152
if (i + 1 == size)
1153
return false;
1154
1155
char escape = data[i + 1];
1156
i += 2; // skip \e
1157
1158
switch (escape)
1159
{
1160
case '\n':
1161
data[write++] = '\n';
1162
break;
1163
1164
case '\r':
1165
data[write++] = '\n';
1166
if (i < size && data[i] == '\n')
1167
i++;
1168
break;
1169
1170
case 0:
1171
return false;
1172
1173
case 'x':
1174
{
1175
// hex escape codes are exactly 2 hex digits long
1176
if (i + 2 > size)
1177
return false;
1178
1179
unsigned int code = 0;
1180
1181
for (int j = 0; j < 2; ++j)
1182
{
1183
char ch = data[i + j];
1184
if (!isHexDigit(ch))
1185
return false;
1186
1187
// use or trick to convert to lower case
1188
code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
1189
}
1190
1191
data[write++] = char(code);
1192
i += 2;
1193
break;
1194
}
1195
1196
case 'z':
1197
{
1198
while (i < size && isSpace(data[i]))
1199
i++;
1200
break;
1201
}
1202
1203
case 'u':
1204
{
1205
// unicode escape codes are at least 3 characters including braces
1206
if (i + 3 > size)
1207
return false;
1208
1209
if (data[i] != '{')
1210
return false;
1211
i++;
1212
1213
if (data[i] == '}')
1214
return false;
1215
1216
unsigned int code = 0;
1217
1218
for (int j = 0; j < 16; ++j)
1219
{
1220
if (i == size)
1221
return false;
1222
1223
char ch = data[i];
1224
1225
if (ch == '}')
1226
break;
1227
1228
if (!isHexDigit(ch))
1229
return false;
1230
1231
// use or trick to convert to lower case
1232
code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
1233
i++;
1234
}
1235
1236
if (i == size || data[i] != '}')
1237
return false;
1238
i++;
1239
1240
size_t utf8 = toUtf8(&data[write], code);
1241
if (utf8 == 0)
1242
return false;
1243
1244
write += utf8;
1245
break;
1246
}
1247
1248
default:
1249
{
1250
if (isDigit(escape))
1251
{
1252
unsigned int code = escape - '0';
1253
1254
for (int j = 0; j < 2; ++j)
1255
{
1256
if (i == size || !isDigit(data[i]))
1257
break;
1258
1259
code = 10 * code + (data[i] - '0');
1260
i++;
1261
}
1262
1263
if (code > UCHAR_MAX)
1264
return false;
1265
1266
data[write++] = char(code);
1267
}
1268
else
1269
{
1270
data[write++] = unescape(escape);
1271
}
1272
}
1273
}
1274
}
1275
1276
LUAU_ASSERT(write <= size);
1277
data.resize(write);
1278
1279
return true;
1280
}
1281
1282
// Normalizes the contents of a long-bracket string in place: the first newline (if the string starts
// with one) is dropped and every "\r\n" pair is converted to "\n".
// The scan is bounded by data.size(), so an embedded NUL byte does not end it early.
void Lexer::fixupMultilineString(std::string& data)
{
    if (data.empty())
        return;

    // Lua rules for multiline strings are as follows:
    // - standalone \r, \r\n, \n\r and \n are all considered newlines
    // - first newline in the multiline string is skipped
    // - all other newlines are normalized to \n

    // Since our lexer just treats \n as newlines, we apply a simplified set of rules that is sufficient to get normalized newlines for Windows/Unix:
    // - \r\n and \n are considered newlines
    // - first newline is skipped
    // - newlines are normalized to \n

    // This makes the string parsing behavior consistent with general lexing behavior - a standalone \r isn't considered a new line from the line
    // tracking perspective

    const size_t size = data.size();
    size_t read = 0;
    size_t write = 0;

    // skip leading newline
    if (size >= 2 && data[0] == '\r' && data[1] == '\n')
    {
        read = 2;
    }
    else if (data[0] == '\n')
    {
        read = 1;
    }

    // parse the rest of the string, converting newlines as we go
    while (read < size)
    {
        if (data[read] == '\r' && read + 1 < size && data[read + 1] == '\n')
        {
            data[write++] = '\n';
            read += 2;
        }
        else // note, this handles \n by just writing it without changes
        {
            data[write++] = data[read++];
        }
    }

    data.resize(write);
}
1330
1331
} // namespace Luau
1332
1333