CoCalc -- Lexer.cpp

GitHub Repository: Roblox/luau
Path: blob/master/Ast/src/Lexer.cpp
²⁷²⁵ views
1
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
2
#include "Luau/Lexer.h"
3

4
#include "Luau/Allocator.h"
5
#include "Luau/Common.h"
6
#include "Luau/Confusables.h"
7
#include "Luau/StringUtils.h"
8

9
#include <limits.h>
10

11
namespace Luau
12
{
13

14
// Builds a payload-free lexeme: only the type and source location are recorded,
// the length is zeroed and the data pointer is null.
Lexeme::Lexeme(const Location& location, Type type)
    : type(type), location(location), length(0), data(nullptr)
{
}
21

22
// Builds a single-character lexeme. The character is first widened through
// unsigned char so that its byte value (0..255) becomes the lexeme type.
Lexeme::Lexeme(const Location& location, char character)
    : type(static_cast<Type>(static_cast<unsigned char>(character))), location(location), length(0), data(nullptr)
{
}
29

30
// Builds a lexeme that refers to a (data, size) slice of text.
// The size is narrowed to `unsigned`; the pointer is stored as-is (not copied).
// Only string-like, number and comment lexeme types are accepted (checked by the assertion).
Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t size)
    : type(type), location(location), length(unsigned(size)), data(data)
{
    LUAU_ASSERT(
        // plain strings
        type == RawString || type == QuotedString ||
        // pieces of interpolated strings
        type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd || type == InterpStringSimple ||
        type == BrokenInterpDoubleBrace ||
        // numbers and comments
        type == Number || type == Comment || type == BlockComment
    );
}
41

42
// Builds a lexeme that carries a name pointer instead of a data slice.
// Accepted types are Name, Attribute, or anything in [Reserved_BEGIN, Reserved_END).
Lexeme::Lexeme(const Location& location, Type type, const char* name)
    : type(type), location(location), length(0), name(name)
{
    const bool isReservedWord = type >= Reserved_BEGIN && type < Lexeme::Reserved_END;

    LUAU_ASSERT(type == Name || type == Attribute || isReservedWord);
}
50

51
unsigned int Lexeme::getLength() const
52
{
53
    LUAU_ASSERT(
54
        type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
55
        type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment
56
    );
57

58
    return length;
59
}
60

61
// Spellings of the reserved words. The table is indexed by (type - Lexeme::Reserved_BEGIN),
// so the order of the entries is significant.
static const char* kReserved[] = {
    "and",    "break",  "do",   "else", "elseif", "end",   "false",
    "for",    "function", "if", "in",   "local",  "nil",   "not",
    "or",     "repeat", "return", "then", "true", "until", "while",
};
63

64
std::string Lexeme::toString() const
65
{
66
    switch (type)
67
    {
68
    case Eof:
69
        return "<eof>";
70

71
    case Equal:
72
        return "'=='";
73

74
    case LessEqual:
75
        return "'<='";
76

77
    case GreaterEqual:
78
        return "'>='";
79

80
    case NotEqual:
81
        return "'~='";
82

83
    case Dot2:
84
        return "'..'";
85

86
    case Dot3:
87
        return "'...'";
88

89
    case SkinnyArrow:
90
        return "'->'";
91

92
    case DoubleColon:
93
        return "'::'";
94

95
    case FloorDiv:
96
        return "'//'";
97

98
    case AddAssign:
99
        return "'+='";
100

101
    case SubAssign:
102
        return "'-='";
103

104
    case MulAssign:
105
        return "'*='";
106

107
    case DivAssign:
108
        return "'/='";
109

110
    case FloorDivAssign:
111
        return "'//='";
112

113
    case ModAssign:
114
        return "'%='";
115

116
    case PowAssign:
117
        return "'^='";
118

119
    case ConcatAssign:
120
        return "'..='";
121

122
    case RawString:
123
    case QuotedString:
124
        return data ? format("\"%.*s\"", length, data) : "string";
125

126
    case InterpStringBegin:
127
        return data ? format("`%.*s{", length, data) : "the beginning of an interpolated string";
128

129
    case InterpStringMid:
130
        return data ? format("}%.*s{", length, data) : "the middle of an interpolated string";
131

132
    case InterpStringEnd:
133
        return data ? format("}%.*s`", length, data) : "the end of an interpolated string";
134

135
    case InterpStringSimple:
136
        return data ? format("`%.*s`", length, data) : "interpolated string";
137

138
    case Number:
139
        return data ? format("'%.*s'", length, data) : "number";
140

141
    case Name:
142
        return name ? format("'%s'", name) : "identifier";
143

144
    case Comment:
145
        return "comment";
146

147
    case Attribute:
148
        return name ? format("'%s'", name) : "attribute";
149

150
    case AttributeOpen:
151
        return "'@['";
152

153
    case BrokenString:
154
        return "malformed string";
155

156
    case BrokenComment:
157
        return "unfinished comment";
158

159
    case BrokenInterpDoubleBrace:
160
        return "'{{', which is invalid (did you mean '\\{'?)";
161

162
    case BrokenUnicode:
163
        if (codepoint)
164
        {
165
            if (const char* confusable = findConfusable(codepoint))
166
                return format("Unicode character U+%x (did you mean '%s'?)", codepoint, confusable);
167

168
            return format("Unicode character U+%x", codepoint);
169
        }
170
        else
171
        {
172
            return "invalid UTF-8 sequence";
173
        }
174

175
    default:
176
        if (type < Char_END)
177
            return format("'%c'", type);
178
        else if (type >= Reserved_BEGIN && type < Reserved_END)
179
            return format("'%s'", kReserved[type - Reserved_BEGIN]);
180
        else
181
            return "<unknown>";
182
    }
183
}
184

185
bool AstNameTable::Entry::operator==(const Entry& other) const
186
{
187
    return length == other.length && memcmp(value.value, other.value.value, length) == 0;
188
}
189

190
size_t AstNameTable::EntryHash::operator()(const Entry& e) const
191
{
192
    // FNV1a
193
    uint32_t hash = 2166136261;
194

195
    for (size_t i = 0; i < e.length; ++i)
196
    {
197
        hash ^= uint8_t(e.value.value[i]);
198
        hash *= 16777619;
199
    }
200

201
    return hash;
202
}
203

204
// Creates the name table and pre-registers every reserved word with its lexeme type.
// NOTE(review): the arguments to `data` look like an empty-key sentinel and an initial capacity;
// confirm against the container's declaration.
AstNameTable::AstNameTable(Allocator& allocator)
    : data({AstName(""), 0, Lexeme::Eof}, 128)
    , allocator(allocator)
{
    // the keyword table must have exactly one spelling per reserved lexeme type
    static_assert(sizeof(kReserved) / sizeof(kReserved[0]) == Lexeme::Reserved_END - Lexeme::Reserved_BEGIN);

    const int reservedCount = Lexeme::Reserved_END - Lexeme::Reserved_BEGIN;

    for (int index = 0; index < reservedCount; ++index)
        addStatic(kReserved[index], static_cast<Lexeme::Type>(Lexeme::Reserved_BEGIN + index));
}
213

214
// Registers `name` under the given lexeme type without copying it: the table entry keeps the
// caller's pointer. Asserts that the name was not present before.
// Returns the AstName wrapping the supplied pointer.
AstName AstNameTable::addStatic(const char* name, Lexeme::Type type)
{
    const AstName wrapped(name);

    AstNameTable::Entry entry = {wrapped, uint32_t(strlen(name)), type};

    LUAU_ASSERT(!data.contains(entry));
    data.insert(entry);

    return wrapped;
}
223

224
std::pair<AstName, Lexeme::Type> AstNameTable::getOrAddWithType(const char* name, size_t length)
225
{
226
    AstNameTable::Entry key = {AstName(name), uint32_t(length), Lexeme::Eof};
227
    const Entry& entry = data.insert(key);
228

229
    // entry already was inserted
230
    if (entry.type != Lexeme::Eof)
231
        return std::make_pair(entry.value, entry.type);
232

233
    // we just inserted an entry with a non-owned pointer into the map
234
    // we need to correct it, *but* we need to be careful about not disturbing the hash value
235
    char* nameData = static_cast<char*>(allocator.allocate(length + 1));
236
    memcpy(nameData, name, length);
237
    nameData[length] = 0;
238

239
    const_cast<Entry&>(entry).value = AstName(nameData);
240
    const_cast<Entry&>(entry).type = (name[0] == '@' ? Lexeme::Attribute : Lexeme::Name);
241

242
    return std::make_pair(entry.value, entry.type);
243
}
244

245
std::pair<AstName, Lexeme::Type> AstNameTable::getWithType(const char* name, size_t length) const
246
{
247
    if (const Entry* entry = data.find({AstName(name), uint32_t(length), Lexeme::Eof}))
248
    {
249
        return std::make_pair(entry->value, entry->type);
250
    }
251
    return std::make_pair(AstName(), Lexeme::Name);
252
}
253

254
// Same as getOrAddWithType, but returns only the name and drops the lexeme type.
AstName AstNameTable::getOrAdd(const char* name, size_t len)
{
    std::pair<AstName, Lexeme::Type> interned = getOrAddWithType(name, len);
    return interned.first;
}
258

259
// Convenience overload for NUL-terminated names: the length is measured with strlen.
AstName AstNameTable::getOrAdd(const char* name)
{
    const size_t measured = strlen(name);
    std::pair<AstName, Lexeme::Type> interned = getOrAddWithType(name, measured);
    return interned.first;
}
263

264
// Read-only lookup of a NUL-terminated name; never inserts.
// Returns whatever name getWithType reports (a default-constructed AstName when absent).
AstName AstNameTable::get(const char* name) const
{
    const size_t measured = strlen(name);
    std::pair<AstName, Lexeme::Type> found = getWithType(name, measured);
    return found.first;
}
268

269
// True for ASCII letters 'a'..'z' and 'A'..'Z'.
inline bool isAlpha(char ch)
{
    // OR-ing in ' ' (0x20) maps upper case onto lower case; after subtracting 'a', anything that is
    // not a letter ends up as a large unsigned value, so one comparison covers the whole range
    const unsigned int distanceFromA = unsigned((ch | ' ') - 'a');
    return distanceFromA < 26;
}
274

275
// True for ASCII decimal digits '0'..'9'.
inline bool isDigit(char ch)
{
    // characters below '0' wrap to a large unsigned value, so one comparison checks both bounds
    const unsigned int distanceFromZero = unsigned(ch - '0');
    return distanceFromZero < 10;
}
279

280
// True for ASCII hexadecimal digits: '0'..'9', 'a'..'f' and 'A'..'F'.
inline bool isHexDigit(char ch)
{
    // both checks rely on unsigned wrap-around to test a range with a single comparison;
    // OR-ing in ' ' (0x20) maps upper-case letters onto lower case first
    const unsigned int distanceFromZero = unsigned(ch - '0');
    const unsigned int distanceFromA = unsigned((ch | ' ') - 'a');

    return distanceFromZero < 10 || distanceFromA < 6;
}
285

286
// True only for the line feed character; carriage return is not treated as a newline here.
inline bool isNewline(char ch)
{
    const char lineFeed = '\n';
    return ch == lineFeed;
}
290

291
// Maps the letter of a single-character escape sequence (a, b, f, n, r, t, v) to the control
// character it stands for. Any other character is returned unchanged.
static char unescape(char ch)
{
    // parallel tables: kLetters[i] is the escape letter, kControls[i] the character it produces
    static const char kLetters[] = "abfnrtv";
    static const char kControls[] = "\a\b\f\n\r\t\v";

    for (int i = 0; kLetters[i] != 0; ++i)
    {
        if (kLetters[i] == ch)
            return kControls[i];
    }

    return ch;
}
313

314
unsigned int Lexeme::getBlockDepth() const
315
{
316
    LUAU_ASSERT(type == Lexeme::RawString || type == Lexeme::BlockComment);
317

318
    // If we have a well-formed string, we are guaranteed to see 2 `]` characters after the end of the string contents
319
    LUAU_ASSERT(*(data + length) == ']');
320
    unsigned int depth = 0;
321
    do
322
    {
323
        depth++;
324
    } while (*(data + length + depth) != ']');
325

326
    return depth - 1;
327
}
328

329
// Reports which quote character terminates a quoted string by inspecting the character that
// immediately follows the string contents.
Lexeme::QuoteStyle Lexeme::getQuoteStyle() const
{
    LUAU_ASSERT(type == Lexeme::QuotedString);

    // If we have a well-formed string, we are guaranteed to see a closing delimiter after the string
    LUAU_ASSERT(data);

    switch (data[length])
    {
    case '\'':
        return Lexeme::QuoteStyle::Single;
    case '"':
        return Lexeme::QuoteStyle::Double;
    default:
        break;
    }

    LUAU_ASSERT(!"Unknown quote style");
    return Lexeme::QuoteStyle::Double; // unreachable, but required due to compiler warning
}
345

346
// Sets up a lexer over `buffer` (of `bufferSize` bytes) that reports positions starting at
// `startPosition`. Comments are not skipped and names are read by default; the current lexeme
// starts out as an Eof lexeme located at the start position.
Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Position startPosition)
    : buffer(buffer), bufferSize(bufferSize), offset(0)
    , line(startPosition.line)
    // columns are computed as (offset - lineOffset), so a "negative" line offset makes the first
    // character land on startPosition.column via unsigned wrap-around
    , lineOffset(0u - startPosition.column)
    , lexeme((Location(Position(startPosition.line, startPosition.column), 0)), Lexeme::Eof)
    , names(names), skipComments(false), readNames(true)
{
}
358

359
void Lexer::setSkipComments(bool skip)
360
{
361
    skipComments = skip;
362
}
363

364
void Lexer::setReadNames(bool read)
365
{
366
    readNames = read;
367
}
368

369
// Advances to the next lexeme using the lexer-wide comment-skipping setting and records the
// previous lexeme's location.
const Lexeme& Lexer::next()
{
    const bool skip = this->skipComments;
    return next(skip, /* updatePrevLocation= */ true);
}
373

374
// Advances to the next lexeme and returns a reference to it.
// When `skipComments` is set, Comment and BlockComment lexemes are read and discarded.
// When `updatePrevLocation` is set, the location of the lexeme that was current on entry is saved
// to prevLocation (once, before the first read).
const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation)
{
    for (;;)
    {
        // consume whitespace before the token
        while (isSpace(peekch()))
            consumeAny();

        // remember the outgoing lexeme only on the first pass; skipped comments must not overwrite it
        if (updatePrevLocation)
        {
            prevLocation = lexeme.location;
            updatePrevLocation = false;
        }

        lexeme = readNext();

        const bool gotComment = lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment;

        // in skipComments mode we reject valid comments
        if (!(skipComments && gotComment))
            return lexeme;
    }
}
392

393
void Lexer::nextline()
394
{
395
    while (peekch() != 0 && peekch() != '\r' && !isNewline(peekch()))
396
        consume();
397

398
    next();
399
}
400

401
// Returns the lexeme that next() would produce, leaving the lexer state as it was before the call.
Lexeme Lexer::lookahead()
{
    // snapshot everything that is restored below
    const unsigned int savedOffset = offset;
    const unsigned int savedLine = line;
    const unsigned int savedLineOffset = lineOffset;
    const Lexeme savedLexeme = lexeme;
    const Location savedPrevLocation = prevLocation;
    const size_t savedBraceCount = braceStack.size();
    const BraceType savedBraceTop = braceStack.empty() ? BraceType::Normal : braceStack.back();

    Lexeme peeked = next();

    offset = savedOffset;
    line = savedLine;
    lineOffset = savedLineOffset;
    lexeme = savedLexeme;
    prevLocation = savedPrevLocation;

    // undo a brace-stack change of one entry in either direction: re-push the entry that was
    // popped, or drop the entry that was pushed
    const size_t braceCountNow = braceStack.size();

    if (braceCountNow < savedBraceCount)
        braceStack.push_back(savedBraceTop);
    else if (braceCountNow > savedBraceCount)
        braceStack.pop_back();

    return peeked;
}
426

427
bool Lexer::isReserved(const std::string& word)
428
{
429
    for (int i = Lexeme::Reserved_BEGIN; i < Lexeme::Reserved_END; ++i)
430
        if (word == kReserved[i - Lexeme::Reserved_BEGIN])
431
            return true;
432

433
    return false;
434
}
435

436
// Returns the character at the current offset without consuming it, or 0 once the offset has
// reached the end of the buffer.
LUAU_FORCEINLINE
char Lexer::peekch() const
{
    if (offset >= bufferSize)
        return 0;

    return buffer[offset];
}
441

442
// Returns the character `lookahead` positions past the current offset without consuming anything,
// or 0 when that position lies at or beyond the end of the buffer.
LUAU_FORCEINLINE
char Lexer::peekch(unsigned int lookahead) const
{
    if (offset + lookahead >= bufferSize)
        return 0;

    return buffer[offset + lookahead];
}
447

448
// Current position: the current line paired with the column, where the column is the distance
// from the recorded start-of-line offset to the current offset.
LUAU_FORCEINLINE
Position Lexer::position() const
{
    Position current(line, offset - lineOffset);
    return current;
}
453

454
// Advances past the current character without any line bookkeeping.
LUAU_FORCEINLINE
void Lexer::consume()
{
    // consume() assumes current character is known to not be a newline; use consumeAny if this is not guaranteed
    LUAU_ASSERT(!isNewline(buffer[offset]));

    ++offset;
}
462

463
// Advances past the current character, whatever it is. When that character is a newline, the line
// counter is bumped and the start-of-line offset is moved to the character after it.
LUAU_FORCEINLINE
void Lexer::consumeAny()
{
    const bool leavingLine = isNewline(buffer[offset]);

    ++offset;

    if (leavingLine)
    {
        ++line;
        lineOffset = offset; // the new line starts right after the newline just consumed
    }
}
474

475
// Reads a comment starting at `--`.
// If a well-formed long bracket opener follows, the comment is read as a block comment (or a
// BrokenComment when it is not terminated). Otherwise everything up to the end of the line is
// returned as a Comment lexeme whose data starts right after the two dashes.
Lexeme Lexer::readCommentBody()
{
    const Position start = position();

    LUAU_ASSERT(peekch(0) == '-' && peekch(1) == '-');
    consume();
    consume();

    // remembered before probing for `[`, so a failed probe still leaves the bracket in the comment text
    const size_t textBegin = offset;

    if (peekch() == '[')
    {
        const int sep = skipLongSeparator();

        if (sep >= 0)
            return readLongString(start, sep, Lexeme::BlockComment, Lexeme::BrokenComment);
    }

    // fall back to single-line comment
    for (char ch = peekch(); ch != 0 && ch != '\r' && !isNewline(ch); ch = peekch())
        consume();

    return Lexeme(Location(start, position()), Lexeme::Comment, &buffer[textBegin], offset - textBegin);
}
501

502
// Given a sequence [===[ or ]===], returns:
503
// 1. number of equal signs (or 0 if none present) between the brackets
504
// 2. -1 if this is not a long comment/string separator
505
// 3. -N if this is a malformed separator
506
// Does *not* consume the closing brace.
507
int Lexer::skipLongSeparator()
508
{
509
    char start = peekch();
510

511
    LUAU_ASSERT(start == '[' || start == ']');
512
    consume();
513

514
    int count = 0;
515

516
    while (peekch() == '=')
517
    {
518
        consume();
519
        count++;
520
    }
521

522
    return (start == peekch()) ? count : (-count) - 1;
523
}
524

525
// Reads the body of a long-bracket string or comment. On entry the lexer sits on the second `[` of
// the opener and `sep` is the number of `=` in it.
// Returns a lexeme of type `ok` covering the text between the brackets when a closer with the same
// number of `=` is found, or a payload-free lexeme of type `broken` when the input ends first.
Lexeme Lexer::readLongString(const Position& start, int sep, Lexeme::Type ok, Lexeme::Type broken)
{
    // skip (second) [
    LUAU_ASSERT(peekch() == '[');
    consume();

    const unsigned int contentBegin = offset;

    while (peekch())
    {
        if (peekch() != ']')
        {
            consumeAny();
            continue;
        }

        // a candidate closer: it only counts when its `=` count matches the opener;
        // otherwise keep scanning from wherever the separator probe stopped
        if (skipLongSeparator() != sep)
            continue;

        LUAU_ASSERT(peekch() == ']');
        consume(); // skip (second) ]

        // step back over the closer: two brackets plus `sep` equal signs
        const unsigned int contentEnd = offset - sep - 2;
        LUAU_ASSERT(contentEnd >= contentBegin);

        return Lexeme(Location(start, position()), ok, &buffer[contentBegin], contentEnd - contentBegin);
    }

    return Lexeme(Location(start, position()), broken);
}
556

557
void Lexer::readBackslashInString()
558
{
559
    LUAU_ASSERT(peekch() == '\\');
560
    consume();
561
    switch (peekch())
562
    {
563
    case '\r':
564
        consume();
565
        if (peekch() == '\n')
566
            consumeAny();
567
        break;
568

569
    case 0:
570
        break;
571

572
    case 'z':
573
        consume();
574
        while (isSpace(peekch()))
575
            consumeAny();
576
        break;
577

578
    default:
579
        consumeAny();
580
    }
581
}
582

583
// Reads a string delimited by single or double quotes.
// Returns a QuotedString lexeme whose data is the raw text between the quotes (escapes are left
// undecoded), or a BrokenString lexeme when the end of input or an unescaped line break is reached
// before the closing quote.
Lexeme Lexer::readQuotedString()
{
    const Position start = position();

    const char delimiter = peekch();
    LUAU_ASSERT(delimiter == '\'' || delimiter == '"');
    consume();

    const unsigned int contentBegin = offset;

    for (char ch = peekch(); ch != delimiter; ch = peekch())
    {
        if (ch == 0 || ch == '\r' || ch == '\n')
            return Lexeme(Location(start, position()), Lexeme::BrokenString);

        if (ch == '\\')
            readBackslashInString();
        else
            consume();
    }

    // closing quote
    consume();

    // the closing quote has been consumed but is not part of the contents, hence the -1
    return Lexeme(Location(start, position()), Lexeme::QuotedString, &buffer[contentBegin], offset - contentBegin - 1);
}
615

616
// Starts reading an interpolated string at its opening backtick. The first section is reported as
// InterpStringBegin when it stops at `{`, or InterpStringSimple when it runs to the closing backtick.
Lexeme Lexer::readInterpolatedStringBegin()
{
    LUAU_ASSERT(peekch() == '`');

    const Position openingBacktick = position();
    consume();

    return readInterpolatedStringSection(openingBacktick, Lexeme::InterpStringBegin, Lexeme::InterpStringSimple);
}
625

626
// Reads one literal section of an interpolated string, starting at the current offset.
// Returns:
// - a lexeme of `formatType` when the section stops at `{` (an interpolated-string entry is pushed
//   on the brace stack)
// - a lexeme of `endType` when the section stops at the closing backtick
// - BrokenInterpDoubleBrace when `{{` is found
// - BrokenString when the end of input or a line break is reached first
Lexeme Lexer::readInterpolatedStringSection(Position start, Lexeme::Type formatType, Lexeme::Type endType)
{
    const unsigned int sectionBegin = offset;

    for (char ch = peekch(); ch != '`'; ch = peekch())
    {
        if (ch == 0 || ch == '\r' || ch == '\n')
            return Lexeme(Location(start, position()), Lexeme::BrokenString);

        if (ch == '\\')
        {
            // Allow for \u{}, which would otherwise be consumed by looking for {
            if (peekch(1) == 'u' && peekch(2) == '{')
            {
                consume(); // backslash
                consume(); // u
                consume(); // {
            }
            else
            {
                readBackslashInString();
            }

            continue;
        }

        if (ch == '{')
        {
            braceStack.push_back(BraceType::InterpolatedString);

            if (peekch(1) == '{')
            {
                // the lexeme is built before the braces are consumed, so its location and data stop in front of them
                Lexeme doubleBrace =
                    Lexeme(Location(start, position()), Lexeme::BrokenInterpDoubleBrace, &buffer[sectionBegin], offset - sectionBegin);
                consume();
                consume();
                return doubleBrace;
            }

            consume();

            // the `{` has been consumed but is not part of the section text, hence the -1
            return Lexeme(Location(start, position()), formatType, &buffer[sectionBegin], offset - sectionBegin - 1);
        }

        consume();
    }

    // closing backtick
    consume();

    // the backtick has been consumed but is not part of the section text, hence the -1
    return Lexeme(Location(start, position()), endType, &buffer[sectionBegin], offset - sectionBegin - 1);
}
678

679
// Skips over a number-like run of characters and returns it as a Number lexeme whose data starts
// at `startOffset`.
//
// This function does not do the number parsing - it only skips a number-like pattern.
// It uses the same logic as Lua stock lexer; the resulting string is later converted
// to a number with proper verification.
Lexeme Lexer::readNumber(const Position& start, unsigned int startOffset)
{
    LUAU_ASSERT(isDigit(peekch()));

    // leading digit, then any mix of digits, dots and underscores
    consume();

    while (isDigit(peekch()) || peekch() == '.' || peekch() == '_')
        consume();

    // optional exponent marker with an optional sign
    const char marker = peekch();

    if (marker == 'e' || marker == 'E')
    {
        consume();

        const char sign = peekch();

        if (sign == '+' || sign == '-')
            consume();
    }

    // trailing alphanumerics and underscores are swallowed into the same lexeme
    while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_')
        consume();

    return Lexeme(Location(start, position()), Lexeme::Number, &buffer[startOffset], offset - startOffset);
}
704

705
std::pair<AstName, Lexeme::Type> Lexer::readName()
706
{
707
    LUAU_ASSERT(isAlpha(peekch()) || peekch() == '_' || peekch() == '@');
708

709
    unsigned int startOffset = offset;
710

711
    do
712
        consume();
713
    while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_');
714

715
    return readNames ? names.getOrAddWithType(&buffer[startOffset], offset - startOffset)
716
                     : names.getWithType(&buffer[startOffset], offset - startOffset);
717
}
718

719
// Reads one lexeme starting at the current offset (whitespace must already have been skipped) and
// returns it. Dispatches on the first character; multi-character operators are recognized by
// peeking at the characters that follow.
Lexeme Lexer::readNext()
{
    const Position start = position();
    const char first = peekch();

    // Shared shape of the operators that are either a lone character or that character followed
    // by '=': consumes what it recognizes and returns the matching lexeme.
    auto singleOrWithEquals = [&](Lexeme::Type withEquals) -> Lexeme
    {
        consume();

        if (peekch() != '=')
            return Lexeme(Location(start, 1), first);

        consume();
        return Lexeme(Location(start, 2), withEquals);
    };

    switch (first)
    {
    case 0:
        return Lexeme(Location(start, 0), Lexeme::Eof);

    case '-':
    {
        const char second = peekch(1);

        // comments are handed over untouched; the comment reader consumes both dashes itself
        if (second == '-')
            return readCommentBody();

        consume();

        if (second == '>')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::SkinnyArrow);
        }

        if (second == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::SubAssign);
        }

        return Lexeme(Location(start, 1), '-');
    }

    case '[':
    {
        const int sep = skipLongSeparator();

        // well-formed long bracket opener
        if (sep >= 0)
            return readLongString(start, sep, Lexeme::RawString, Lexeme::BrokenString);

        // a plain '[' with nothing separator-like after it
        if (sep == -1)
            return Lexeme(Location(start, 1), '[');

        // '[' followed by '=' but no second '['
        return Lexeme(Location(start, position()), Lexeme::BrokenString);
    }

    case '{':
    {
        consume();

        // braces are only tracked while inside an interpolated string
        if (!braceStack.empty())
            braceStack.push_back(BraceType::Normal);

        return Lexeme(Location(start, 1), '{');
    }

    case '}':
    {
        consume();

        if (!braceStack.empty())
        {
            const BraceType closed = braceStack.back();
            braceStack.pop_back();

            // this brace ends an interpolated expression: continue with the next string section
            if (closed == BraceType::InterpolatedString)
                return readInterpolatedStringSection(start, Lexeme::InterpStringMid, Lexeme::InterpStringEnd);
        }

        return Lexeme(Location(start, 1), '}');
    }

    case '=':
        return singleOrWithEquals(Lexeme::Equal);

    case '<':
        return singleOrWithEquals(Lexeme::LessEqual);

    case '>':
        return singleOrWithEquals(Lexeme::GreaterEqual);

    case '~':
        return singleOrWithEquals(Lexeme::NotEqual);

    case '+':
        return singleOrWithEquals(Lexeme::AddAssign);

    case '*':
        return singleOrWithEquals(Lexeme::MulAssign);

    case '%':
        return singleOrWithEquals(Lexeme::ModAssign);

    case '^':
        return singleOrWithEquals(Lexeme::PowAssign);

    case '"':
    case '\'':
        return readQuotedString();

    case '`':
        return readInterpolatedStringBegin();

    case '.':
    {
        consume();

        if (peekch() != '.')
        {
            // a dot directly followed by a digit starts a number; the dot is part of its text
            if (isDigit(peekch()))
                return readNumber(start, offset - 1);

            return Lexeme(Location(start, 1), '.');
        }

        consume();

        if (peekch() == '.')
        {
            consume();
            return Lexeme(Location(start, 3), Lexeme::Dot3);
        }

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 3), Lexeme::ConcatAssign);
        }

        return Lexeme(Location(start, 2), Lexeme::Dot2);
    }

    case '/':
    {
        consume();

        const char second = peekch();

        if (second == '=')
        {
            consume();
            return Lexeme(Location(start, 2), Lexeme::DivAssign);
        }

        if (second != '/')
            return Lexeme(Location(start, 1), '/');

        consume();

        if (peekch() == '=')
        {
            consume();
            return Lexeme(Location(start, 3), Lexeme::FloorDivAssign);
        }

        return Lexeme(Location(start, 2), Lexeme::FloorDiv);
    }

    case ':':
    {
        consume();

        if (peekch() != ':')
            return Lexeme(Location(start, 1), ':');

        consume();
        return Lexeme(Location(start, 2), Lexeme::DoubleColon);
    }

    // punctuation that never combines with a following character
    case '(':
    case ')':
    case ']':
    case ';':
    case ',':
    case '#':
    case '?':
    case '&':
    case '|':
    {
        consume();
        return Lexeme(Location(start, 1), first);
    }

    case '@':
    {
        if (peekch(1) == '[')
        {
            consume();
            consume();

            return Lexeme(Location(start, 2), Lexeme::AttributeOpen);
        }

        // consume @ first
        consume();

        if (isAlpha(peekch()) || peekch() == '_')
        {
            std::pair<AstName, Lexeme::Type> attribute = readName();
            return Lexeme(Location(start, position()), Lexeme::Attribute, attribute.first.value);
        }

        // a lone '@' is still reported as an attribute, with an empty name
        return Lexeme(Location(start, position()), Lexeme::Attribute, "");
    }

    default:
        if (isDigit(first))
            return readNumber(start, offset);

        if (isAlpha(first) || first == '_')
        {
            std::pair<AstName, Lexeme::Type> name = readName();

            return Lexeme(Location(start, position()), name.second, name.first.value);
        }

        // bytes with the high bit set are not valid outside strings and comments
        if (first & 0x80)
            return readUtf8Error();

        // anything else becomes a single-character lexeme
        consume();

        return Lexeme(Location(start, 1), first);
    }
}
1039

1040
std::optional<Lexer::BraceType> Lexer::peekBraceStackTop()
1041
{
1042
    if (braceStack.empty())
1043
        return std::nullopt;
1044
    else
1045
        return {braceStack.back()};
1046
}
1047

1048
// Consumes one UTF-8 encoded character and reports it as a BrokenUnicode lexeme.
// When the byte sequence decodes, the lexeme's codepoint is set to the decoded value; when the
// lead byte or a continuation byte is malformed, the codepoint is left unset.
LUAU_NOINLINE Lexeme Lexer::readUtf8Error()
{
    const Position start = position();
    const char lead = peekch();

    uint32_t codepoint = 0;
    int size = 0;

    // the high bits of the lead byte give the sequence length; the remaining bits are payload
    if ((lead & 0x80) == 0x00) // 0xxxxxxx
    {
        size = 1;
        codepoint = lead & 0x7F;
    }
    else if ((lead & 0xE0) == 0xC0) // 110xxxxx
    {
        size = 2;
        codepoint = lead & 0x1F;
    }
    else if ((lead & 0xF0) == 0xE0) // 1110xxxx
    {
        size = 3;
        codepoint = lead & 0x0F;
    }
    else if ((lead & 0xF8) == 0xF0) // 11110xxx
    {
        size = 4;
        codepoint = lead & 0x07;
    }
    else
    {
        // not a valid lead byte: skip it on its own
        consume();
        return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);
    }

    consume();

    for (int remaining = size - 1; remaining > 0; --remaining)
    {
        const char continuation = peekch();

        // continuation bytes must look like 10xxxxxx; stop without consuming the offending byte
        if ((continuation & 0xC0) != 0x80)
            return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);

        codepoint = codepoint << 6;
        codepoint |= (continuation & 0x3F);
        consume();
    }

    Lexeme broken(Location(start, position()), Lexeme::BrokenUnicode);
    broken.codepoint = codepoint;
    return broken;
}
1096

1097
// Encodes `code` as UTF-8 into `data`, which must have room for up to 4 bytes.
// Returns the number of bytes written (1-4), or 0 without writing anything when `code` is
// 0x110000 or larger.
static size_t toUtf8(char* data, unsigned int code)
{
    // beyond U+10FFFF
    if (code >= 0x110000)
        return 0;

    // U+0000..U+007F -> 1 byte, U+0080..U+07FF -> 2, U+0800..U+FFFF -> 3, U+10000..U+10FFFF -> 4
    const size_t count = code < 0x80 ? 1 : code < 0x800 ? 2 : code < 0x10000 ? 3 : 4;

    // marker bits of the lead byte, indexed by (count - 1)
    static const unsigned int kLeadMarker[] = {0x00, 0xC0, 0xE0, 0xF0};

    // continuation bytes carry 6 payload bits each and are filled in from the last one backwards
    unsigned int remaining = code;

    for (size_t i = count - 1; i > 0; --i)
    {
        data[i] = char(0x80 | (remaining & 0x3F));
        remaining >>= 6;
    }

    // whatever is left fits in the payload bits of the lead byte
    data[0] = char(kLeadMarker[count - 1] | remaining);

    return count;
}
1134

1135
bool Lexer::fixupQuotedString(std::string& data)
1136
{
1137
    if (data.empty() || data.find('\\') == std::string::npos)
1138
        return true;
1139

1140
    size_t size = data.size();
1141
    size_t write = 0;
1142

1143
    for (size_t i = 0; i < size;)
1144
    {
1145
        if (data[i] != '\\')
1146
        {
1147
            data[write++] = data[i];
1148
            i++;
1149
            continue;
1150
        }
1151

1152
        if (i + 1 == size)
1153
            return false;
1154

1155
        char escape = data[i + 1];
1156
        i += 2; // skip \e
1157

1158
        switch (escape)
1159
        {
1160
        case '\n':
1161
            data[write++] = '\n';
1162
            break;
1163

1164
        case '\r':
1165
            data[write++] = '\n';
1166
            if (i < size && data[i] == '\n')
1167
                i++;
1168
            break;
1169

1170
        case 0:
1171
            return false;
1172

1173
        case 'x':
1174
        {
1175
            // hex escape codes are exactly 2 hex digits long
1176
            if (i + 2 > size)
1177
                return false;
1178

1179
            unsigned int code = 0;
1180

1181
            for (int j = 0; j < 2; ++j)
1182
            {
1183
                char ch = data[i + j];
1184
                if (!isHexDigit(ch))
1185
                    return false;
1186

1187
                // use or trick to convert to lower case
1188
                code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
1189
            }
1190

1191
            data[write++] = char(code);
1192
            i += 2;
1193
            break;
1194
        }
1195

1196
        case 'z':
1197
        {
1198
            while (i < size && isSpace(data[i]))
1199
                i++;
1200
            break;
1201
        }
1202

1203
        case 'u':
1204
        {
1205
            // unicode escape codes are at least 3 characters including braces
1206
            if (i + 3 > size)
1207
                return false;
1208

1209
            if (data[i] != '{')
1210
                return false;
1211
            i++;
1212

1213
            if (data[i] == '}')
1214
                return false;
1215

1216
            unsigned int code = 0;
1217

1218
            for (int j = 0; j < 16; ++j)
1219
            {
1220
                if (i == size)
1221
                    return false;
1222

1223
                char ch = data[i];
1224

1225
                if (ch == '}')
1226
                    break;
1227

1228
                if (!isHexDigit(ch))
1229
                    return false;
1230

1231
                // use or trick to convert to lower case
1232
                code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
1233
                i++;
1234
            }
1235

1236
            if (i == size || data[i] != '}')
1237
                return false;
1238
            i++;
1239

1240
            size_t utf8 = toUtf8(&data[write], code);
1241
            if (utf8 == 0)
1242
                return false;
1243

1244
            write += utf8;
1245
            break;
1246
        }
1247

1248
        default:
1249
        {
1250
            if (isDigit(escape))
1251
            {
1252
                unsigned int code = escape - '0';
1253

1254
                for (int j = 0; j < 2; ++j)
1255
                {
1256
                    if (i == size || !isDigit(data[i]))
1257
                        break;
1258

1259
                    code = 10 * code + (data[i] - '0');
1260
                    i++;
1261
                }
1262

1263
                if (code > UCHAR_MAX)
1264
                    return false;
1265

1266
                data[write++] = char(code);
1267
            }
1268
            else
1269
            {
1270
                data[write++] = unescape(escape);
1271
            }
1272
        }
1273
        }
1274
    }
1275

1276
    LUAU_ASSERT(write <= size);
1277
    data.resize(write);
1278

1279
    return true;
1280
}
1281

1282
// Normalizes the newlines of a multiline string in place.
//
// Lua itself treats standalone \r, \r\n, \n\r and \n as newlines, skips the first newline of a multiline string and normalizes all the
// others to \n. The lexer only tracks \n as a line break, so a reduced rule set is applied here that is enough for Windows/Unix sources:
// - only \r\n and \n count as newlines
// - a newline at the very start of the string is dropped
// - every remaining \r\n is rewritten as \n
//
// A standalone \r is therefore copied through unchanged, which keeps string contents consistent with the lexer's line tracking.
// Scanning stops at the first NUL byte, and the string is shrunk to the number of bytes written.
void Lexer::fixupMultilineString(std::string& data)
{
    if (data.empty())
        return;

    // a single cursor pair over the same buffer: `out` never gets ahead of `in`, so rewriting in place is safe
    char* text = &data[0];
    size_t in = 0;
    size_t out = 0;

    // drop the leading newline, if there is one
    if (text[0] == '\n')
        in = 1;
    else if (text[0] == '\r' && text[1] == '\n')
        in = 2;

    // copy the remainder, collapsing each \r\n pair into a single \n
    for (char ch = text[in]; ch != 0; ch = text[in])
    {
        const bool crlf = ch == '\r' && text[in + 1] == '\n';

        text[out++] = crlf ? '\n' : ch; // a plain \n falls into the verbatim copy
        in += crlf ? 2 : 1;
    }

    data.resize(out);
}
1330

1331
} // namespace Luau
1332

1333
Product

Resources

Company