Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/llvm/lib/MC/MCParser/AsmLexer.cpp
35269 views
1
//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This class implements the lexer for assembly files.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "llvm/MC/MCParser/AsmLexer.h"
14
#include "llvm/ADT/APInt.h"
15
#include "llvm/ADT/ArrayRef.h"
16
#include "llvm/ADT/StringExtras.h"
17
#include "llvm/ADT/StringRef.h"
18
#include "llvm/ADT/StringSwitch.h"
19
#include "llvm/MC/MCAsmInfo.h"
20
#include "llvm/MC/MCParser/MCAsmLexer.h"
21
#include "llvm/Support/Compiler.h"
22
#include "llvm/Support/SMLoc.h"
23
#include "llvm/Support/SaveAndRestore.h"
24
#include <cassert>
25
#include <cctype>
26
#include <cstdio>
27
#include <cstring>
28
#include <string>
29
#include <tuple>
30
#include <utility>
31
32
using namespace llvm;
33
34
/// Construct a lexer configured from the target's assembly info.
///
/// '@' is accepted inside identifiers unless the target's comment string
/// begins with '@'. Motorola-style integer lexing follows the target setting.
AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
  const StringRef CommentPrefix(MAI.getCommentString());
  AllowAtInIdentifier = !CommentPrefix.starts_with("@");
  LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
}
38
39
// Defaulted: the compiler-generated destructor is used.
AsmLexer::~AsmLexer() = default;
40
41
void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42
bool EndStatementAtEOF) {
43
CurBuf = Buf;
44
45
if (ptr)
46
CurPtr = ptr;
47
else
48
CurPtr = CurBuf.begin();
49
50
TokStart = nullptr;
51
this->EndStatementAtEOF = EndStatementAtEOF;
52
}
53
54
/// ReturnError - Set the error to the specified string at the specified
55
/// location. This is defined to always return AsmToken::Error.
56
AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57
SetError(SMLoc::getFromPointer(Loc), Msg);
58
59
return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60
}
61
62
int AsmLexer::getNextChar() {
63
if (CurPtr == CurBuf.end())
64
return EOF;
65
return (unsigned char)*CurPtr++;
66
}
67
68
int AsmLexer::peekNextChar() {
69
if (CurPtr == CurBuf.end())
70
return EOF;
71
return (unsigned char)*CurPtr;
72
}
73
74
/// The leading integral digit sequence and dot should have already been
75
/// consumed, some or all of the fractional digit sequence *can* have been
76
/// consumed.
77
AsmToken AsmLexer::LexFloatLiteral() {
78
// Skip the fractional digit sequence.
79
while (isDigit(*CurPtr))
80
++CurPtr;
81
82
if (*CurPtr == '-' || *CurPtr == '+')
83
return ReturnError(CurPtr, "invalid sign in float literal");
84
85
// Check for exponent
86
if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87
++CurPtr;
88
89
if (*CurPtr == '-' || *CurPtr == '+')
90
++CurPtr;
91
92
while (isDigit(*CurPtr))
93
++CurPtr;
94
}
95
96
return AsmToken(AsmToken::Real,
97
StringRef(TokStart, CurPtr - TokStart));
98
}
99
100
/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101
/// while making sure there are enough actual digits around for the constant to
102
/// be valid.
103
///
104
/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105
/// before we get here.
106
AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107
assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108
"unexpected parse state in floating hex");
109
bool NoFracDigits = true;
110
111
// Skip the fractional part if there is one
112
if (*CurPtr == '.') {
113
++CurPtr;
114
115
const char *FracStart = CurPtr;
116
while (isHexDigit(*CurPtr))
117
++CurPtr;
118
119
NoFracDigits = CurPtr == FracStart;
120
}
121
122
if (NoIntDigits && NoFracDigits)
123
return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124
"expected at least one significand digit");
125
126
// Make sure we do have some kind of proper exponent part
127
if (*CurPtr != 'p' && *CurPtr != 'P')
128
return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129
"expected exponent part 'p'");
130
++CurPtr;
131
132
if (*CurPtr == '+' || *CurPtr == '-')
133
++CurPtr;
134
135
// N.b. exponent digits are *not* hex
136
const char *ExpStart = CurPtr;
137
while (isDigit(*CurPtr))
138
++CurPtr;
139
140
if (CurPtr == ExpStart)
141
return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142
"expected at least one exponent digit");
143
144
return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145
}
146
147
/// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148
static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149
return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150
(AllowAt && C == '@') || (AllowHash && C == '#');
151
}
152
153
/// Lex an identifier whose first character has already been consumed.
///
/// A leading '.' followed by digits is handed to LexFloatLiteral() unless the
/// digits run into further identifier characters (other than an exponent
/// marker). A lone '.' yields an AsmToken::Dot.
AsmToken AsmLexer::LexIdentifier() {
  const auto IsIdentChar = [this](char C) {
    return isIdentifierChar(C, AllowAtInIdentifier, AllowHashInIdentifier);
  };

  // Check for floating point literals.
  const bool DotThenDigit = CurPtr[-1] == '.' && isDigit(*CurPtr);
  if (DotThenDigit) {
    // Disambiguate a .1243foo identifier from a floating literal.
    while (isDigit(*CurPtr))
      ++CurPtr;

    const char Next = *CurPtr;
    if (Next == 'e' || Next == 'E' || !IsIdentChar(Next))
      return LexFloatLiteral();
  }

  while (IsIdentChar(*CurPtr))
    ++CurPtr;

  // Handle . as a special case.
  const bool IsLoneDot = TokStart + 1 == CurPtr && TokStart[0] == '.';
  if (IsLoneDot)
    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));

  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
}
175
176
/// LexSlash: Slash: /
177
/// C-Style Comment: /* ... */
178
/// C-style Comment: // ...
179
AsmToken AsmLexer::LexSlash() {
180
if (!MAI.shouldAllowAdditionalComments()) {
181
IsAtStartOfStatement = false;
182
return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183
}
184
185
switch (*CurPtr) {
186
case '*':
187
IsAtStartOfStatement = false;
188
break; // C style comment.
189
case '/':
190
++CurPtr;
191
return LexLineComment();
192
default:
193
IsAtStartOfStatement = false;
194
return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195
}
196
197
// C Style comment.
198
++CurPtr; // skip the star.
199
const char *CommentTextStart = CurPtr;
200
while (CurPtr != CurBuf.end()) {
201
switch (*CurPtr++) {
202
case '*':
203
// End of the comment?
204
if (*CurPtr != '/')
205
break;
206
// If we have a CommentConsumer, notify it about the comment.
207
if (CommentConsumer) {
208
CommentConsumer->HandleComment(
209
SMLoc::getFromPointer(CommentTextStart),
210
StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211
}
212
++CurPtr; // End the */.
213
return AsmToken(AsmToken::Comment,
214
StringRef(TokStart, CurPtr - TokStart));
215
}
216
}
217
return ReturnError(TokStart, "unterminated comment");
218
}
219
220
/// LexLineComment: Comment: #[^\n]*
221
/// : //[^\n]*
222
AsmToken AsmLexer::LexLineComment() {
223
// Mark This as an end of statement with a body of the
224
// comment. While it would be nicer to leave this two tokens,
225
// backwards compatability with TargetParsers makes keeping this in this form
226
// better.
227
const char *CommentTextStart = CurPtr;
228
int CurChar = getNextChar();
229
while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230
CurChar = getNextChar();
231
const char *NewlinePtr = CurPtr;
232
if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
233
++CurPtr;
234
235
// If we have a CommentConsumer, notify it about the comment.
236
if (CommentConsumer) {
237
CommentConsumer->HandleComment(
238
SMLoc::getFromPointer(CommentTextStart),
239
StringRef(CommentTextStart, NewlinePtr - 1 - CommentTextStart));
240
}
241
242
IsAtStartOfLine = true;
243
// This is a whole line comment. leave newline
244
if (IsAtStartOfStatement)
245
return AsmToken(AsmToken::EndOfStatement,
246
StringRef(TokStart, CurPtr - TokStart));
247
IsAtStartOfStatement = true;
248
249
return AsmToken(AsmToken::EndOfStatement,
250
StringRef(TokStart, CurPtr - 1 - TokStart));
251
}
252
253
/// Advance \p CurPtr past an optional, case-insensitive integer type suffix:
/// at most one 'U' followed by at most two 'L's (U, L, UL, LL, ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const auto EatIf = [&CurPtr](char Upper, char Lower) {
    if (*CurPtr == Upper || *CurPtr == Lower)
      ++CurPtr;
  };

  EatIf('U', 'u');
  EatIf('L', 'l');
  EatIf('L', 'l');
}
262
263
// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
264
// integer as a hexadecimal, possibly with leading zeroes.
265
static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
266
bool LexHex) {
267
const char *FirstNonDec = nullptr;
268
const char *LookAhead = CurPtr;
269
while (true) {
270
if (isDigit(*LookAhead)) {
271
++LookAhead;
272
} else {
273
if (!FirstNonDec)
274
FirstNonDec = LookAhead;
275
276
// Keep going if we are looking for a 'h' suffix.
277
if (LexHex && isHexDigit(*LookAhead))
278
++LookAhead;
279
else
280
break;
281
}
282
}
283
bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
284
CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
285
if (isHex)
286
return 16;
287
return DefaultRadix;
288
}
289
290
static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
291
while (hexDigitValue(*CurPtr) < DefaultRadix) {
292
++CurPtr;
293
}
294
return CurPtr;
295
}
296
297
/// Build an integer token for \p Value with text \p Ref: AsmToken::Integer
/// when the value fits in 64 bits, AsmToken::BigNum otherwise.
static AsmToken intToken(StringRef Ref, APInt &Value) {
  const AsmToken::TokenKind Kind =
      Value.isIntN(64) ? AsmToken::Integer : AsmToken::BigNum;
  return AsmToken(Kind, Ref, Value);
}
302
303
/// Human-readable name of \p Radix for diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2/8/10/16, and "base-N" for anything else.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
317
318
/// LexDigit: First character is [0-9].
319
/// Local Label: [0-9][:]
320
/// Forward/Backward Label: [0-9][fb]
321
/// Binary integer: 0b[01]+
322
/// Octal integer: 0[0-7]+
323
/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
324
/// Decimal integer: [1-9][0-9]*
325
AsmToken AsmLexer::LexDigit() {
326
// MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
327
// MASM-flavor octal integer: [0-7]+[oOqQ]
328
// MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
329
// MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
330
if (LexMasmIntegers && isdigit(CurPtr[-1])) {
331
const char *FirstNonBinary =
332
(CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
333
const char *FirstNonDecimal =
334
(CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
335
const char *OldCurPtr = CurPtr;
336
while (isHexDigit(*CurPtr)) {
337
switch (*CurPtr) {
338
default:
339
if (!FirstNonDecimal) {
340
FirstNonDecimal = CurPtr;
341
}
342
[[fallthrough]];
343
case '9':
344
case '8':
345
case '7':
346
case '6':
347
case '5':
348
case '4':
349
case '3':
350
case '2':
351
if (!FirstNonBinary) {
352
FirstNonBinary = CurPtr;
353
}
354
break;
355
case '1':
356
case '0':
357
break;
358
}
359
++CurPtr;
360
}
361
if (*CurPtr == '.') {
362
// MASM float literals (other than hex floats) always contain a ".", and
363
// are always written in decimal.
364
++CurPtr;
365
return LexFloatLiteral();
366
}
367
368
if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
369
++CurPtr;
370
return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
371
}
372
373
unsigned Radix = 0;
374
if (*CurPtr == 'h' || *CurPtr == 'H') {
375
// hexadecimal number
376
++CurPtr;
377
Radix = 16;
378
} else if (*CurPtr == 't' || *CurPtr == 'T') {
379
// decimal number
380
++CurPtr;
381
Radix = 10;
382
} else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
383
*CurPtr == 'Q') {
384
// octal number
385
++CurPtr;
386
Radix = 8;
387
} else if (*CurPtr == 'y' || *CurPtr == 'Y') {
388
// binary number
389
++CurPtr;
390
Radix = 2;
391
} else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
392
DefaultRadix < 14 &&
393
(*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
394
Radix = 10;
395
} else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
396
DefaultRadix < 12 &&
397
(*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
398
Radix = 2;
399
}
400
401
if (Radix) {
402
StringRef Result(TokStart, CurPtr - TokStart);
403
APInt Value(128, 0, true);
404
405
if (Result.drop_back().getAsInteger(Radix, Value))
406
return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
407
408
// MSVC accepts and ignores type suffices on integer literals.
409
SkipIgnoredIntegerSuffix(CurPtr);
410
411
return intToken(Result, Value);
412
}
413
414
// default-radix integers, or floating point numbers, fall through
415
CurPtr = OldCurPtr;
416
}
417
418
// MASM default-radix integers: [0-9a-fA-F]+
419
// (All other integer literals have a radix specifier.)
420
if (LexMasmIntegers && UseMasmDefaultRadix) {
421
CurPtr = findLastDigit(CurPtr, 16);
422
StringRef Result(TokStart, CurPtr - TokStart);
423
424
APInt Value(128, 0, true);
425
if (Result.getAsInteger(DefaultRadix, Value)) {
426
return ReturnError(TokStart,
427
"invalid " + radixName(DefaultRadix) + " number");
428
}
429
430
return intToken(Result, Value);
431
}
432
433
// Motorola hex integers: $[0-9a-fA-F]+
434
if (LexMotorolaIntegers && CurPtr[-1] == '$') {
435
const char *NumStart = CurPtr;
436
while (isHexDigit(CurPtr[0]))
437
++CurPtr;
438
439
APInt Result(128, 0);
440
if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
441
return ReturnError(TokStart, "invalid hexadecimal number");
442
443
return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
444
}
445
446
// Motorola binary integers: %[01]+
447
if (LexMotorolaIntegers && CurPtr[-1] == '%') {
448
const char *NumStart = CurPtr;
449
while (*CurPtr == '0' || *CurPtr == '1')
450
++CurPtr;
451
452
APInt Result(128, 0);
453
if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
454
return ReturnError(TokStart, "invalid binary number");
455
456
return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
457
}
458
459
// Decimal integer: [1-9][0-9]*
460
// HLASM-flavour decimal integer: [0-9][0-9]*
461
// FIXME: Later on, support for fb for HLASM has to be added in
462
// as they probably would be needed for asm goto
463
if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
464
unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
465
466
if (!LexHLASMIntegers) {
467
bool IsHex = Radix == 16;
468
// Check for floating point literals.
469
if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
470
if (*CurPtr == '.')
471
++CurPtr;
472
return LexFloatLiteral();
473
}
474
}
475
476
StringRef Result(TokStart, CurPtr - TokStart);
477
478
APInt Value(128, 0, true);
479
if (Result.getAsInteger(Radix, Value))
480
return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
481
482
if (!LexHLASMIntegers)
483
// The darwin/x86 (and x86-64) assembler accepts and ignores type
484
// suffices on integer literals.
485
SkipIgnoredIntegerSuffix(CurPtr);
486
487
return intToken(Result, Value);
488
}
489
490
if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
491
++CurPtr;
492
// See if we actually have "0b" as part of something like "jmp 0b\n"
493
if (!isDigit(CurPtr[0])) {
494
--CurPtr;
495
StringRef Result(TokStart, CurPtr - TokStart);
496
return AsmToken(AsmToken::Integer, Result, 0);
497
}
498
const char *NumStart = CurPtr;
499
while (CurPtr[0] == '0' || CurPtr[0] == '1')
500
++CurPtr;
501
502
// Requires at least one binary digit.
503
if (CurPtr == NumStart)
504
return ReturnError(TokStart, "invalid binary number");
505
506
StringRef Result(TokStart, CurPtr - TokStart);
507
508
APInt Value(128, 0, true);
509
if (Result.substr(2).getAsInteger(2, Value))
510
return ReturnError(TokStart, "invalid binary number");
511
512
// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
513
// suffixes on integer literals.
514
SkipIgnoredIntegerSuffix(CurPtr);
515
516
return intToken(Result, Value);
517
}
518
519
if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
520
++CurPtr;
521
const char *NumStart = CurPtr;
522
while (isHexDigit(CurPtr[0]))
523
++CurPtr;
524
525
// "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
526
// diagnosed by LexHexFloatLiteral).
527
if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
528
return LexHexFloatLiteral(NumStart == CurPtr);
529
530
// Otherwise requires at least one hex digit.
531
if (CurPtr == NumStart)
532
return ReturnError(CurPtr-2, "invalid hexadecimal number");
533
534
APInt Result(128, 0);
535
if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
536
return ReturnError(TokStart, "invalid hexadecimal number");
537
538
// Consume the optional [hH].
539
if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
540
++CurPtr;
541
542
// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
543
// suffixes on integer literals.
544
SkipIgnoredIntegerSuffix(CurPtr);
545
546
return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
547
}
548
549
// Either octal or hexadecimal.
550
APInt Value(128, 0, true);
551
unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
552
StringRef Result(TokStart, CurPtr - TokStart);
553
if (Result.getAsInteger(Radix, Value))
554
return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
555
556
// Consume the [hH].
557
if (Radix == 16)
558
++CurPtr;
559
560
// The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
561
// suffixes on integer literals.
562
SkipIgnoredIntegerSuffix(CurPtr);
563
564
return intToken(Result, Value);
565
}
566
567
/// LexSingleQuote: Integer: 'b'
568
AsmToken AsmLexer::LexSingleQuote() {
569
int CurChar = getNextChar();
570
571
if (LexHLASMStrings)
572
return ReturnError(TokStart, "invalid usage of character literals");
573
574
if (LexMasmStrings) {
575
while (CurChar != EOF) {
576
if (CurChar != '\'') {
577
CurChar = getNextChar();
578
} else if (peekNextChar() == '\'') {
579
// In MASM single-quote strings, doubled single-quotes mean an escaped
580
// single quote, so should be lexed in.
581
(void)getNextChar();
582
CurChar = getNextChar();
583
} else {
584
break;
585
}
586
}
587
if (CurChar == EOF)
588
return ReturnError(TokStart, "unterminated string constant");
589
return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
590
}
591
592
if (CurChar == '\\')
593
CurChar = getNextChar();
594
595
if (CurChar == EOF)
596
return ReturnError(TokStart, "unterminated single quote");
597
598
CurChar = getNextChar();
599
600
if (CurChar != '\'')
601
return ReturnError(TokStart, "single quote way too long");
602
603
// The idea here being that 'c' is basically just an integral
604
// constant.
605
StringRef Res = StringRef(TokStart,CurPtr - TokStart);
606
long long Value;
607
608
if (Res.starts_with("\'\\")) {
609
char theChar = Res[2];
610
switch (theChar) {
611
default: Value = theChar; break;
612
case '\'': Value = '\''; break;
613
case 't': Value = '\t'; break;
614
case 'n': Value = '\n'; break;
615
case 'b': Value = '\b'; break;
616
case 'f': Value = '\f'; break;
617
case 'r': Value = '\r'; break;
618
}
619
} else
620
Value = TokStart[1];
621
622
return AsmToken(AsmToken::Integer, Res, Value);
623
}
624
625
/// LexQuote: String: "..."
626
AsmToken AsmLexer::LexQuote() {
627
int CurChar = getNextChar();
628
if (LexHLASMStrings)
629
return ReturnError(TokStart, "invalid usage of string literals");
630
631
if (LexMasmStrings) {
632
while (CurChar != EOF) {
633
if (CurChar != '"') {
634
CurChar = getNextChar();
635
} else if (peekNextChar() == '"') {
636
// In MASM double-quoted strings, doubled double-quotes mean an escaped
637
// double quote, so should be lexed in.
638
(void)getNextChar();
639
CurChar = getNextChar();
640
} else {
641
break;
642
}
643
}
644
if (CurChar == EOF)
645
return ReturnError(TokStart, "unterminated string constant");
646
return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
647
}
648
649
while (CurChar != '"') {
650
if (CurChar == '\\') {
651
// Allow \", etc.
652
CurChar = getNextChar();
653
}
654
655
if (CurChar == EOF)
656
return ReturnError(TokStart, "unterminated string constant");
657
658
CurChar = getNextChar();
659
}
660
661
return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662
}
663
664
/// Advance to the end of the current statement and return the skipped text.
///
/// Scanning starts at the current position (which becomes TokStart) and stops
/// at the end of the buffer, at a line terminator, at a statement separator,
/// or at the start of a comment. The stopping character is not consumed.
StringRef AsmLexer::LexUntilEndOfStatement() {
  TokStart = CurPtr;

  // Test for the end of the buffer first so that nothing below inspects
  // memory at CurBuf.end().
  while (CurPtr != CurBuf.end() &&
         !isAtStartOfComment(CurPtr) &&     // Start of line comment.
         !isAtStatementSeparator(CurPtr) && // End of statement marker.
         *CurPtr != '\n' && *CurPtr != '\r') {
    ++CurPtr;
  }
  return StringRef(TokStart, CurPtr - TokStart);
}
674
675
/// Advance to the end of the current line and return the skipped text.
///
/// Scanning starts at the current position (which becomes TokStart) and stops
/// at the end of the buffer or at the first '\n' / '\r', which is not
/// consumed.
StringRef AsmLexer::LexUntilEndOfLine() {
  TokStart = CurPtr;

  // Test for the end of the buffer before dereferencing CurPtr.
  while (CurPtr != CurBuf.end() && *CurPtr != '\n' && *CurPtr != '\r') {
    ++CurPtr;
  }
  return StringRef(TokStart, CurPtr - TokStart);
}
683
684
size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685
bool ShouldSkipSpace) {
686
SaveAndRestore SavedTokenStart(TokStart);
687
SaveAndRestore SavedCurPtr(CurPtr);
688
SaveAndRestore SavedAtStartOfLine(IsAtStartOfLine);
689
SaveAndRestore SavedAtStartOfStatement(IsAtStartOfStatement);
690
SaveAndRestore SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691
SaveAndRestore SavedIsPeeking(IsPeeking, true);
692
std::string SavedErr = getErr();
693
SMLoc SavedErrLoc = getErrLoc();
694
695
size_t ReadCount;
696
for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697
AsmToken Token = LexToken();
698
699
Buf[ReadCount] = Token;
700
701
if (Token.is(AsmToken::Eof))
702
break;
703
}
704
705
SetError(SavedErrLoc, SavedErr);
706
return ReadCount;
707
}
708
709
bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710
if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711
return false;
712
713
StringRef CommentString = MAI.getCommentString();
714
715
if (CommentString.size() == 1)
716
return CommentString[0] == Ptr[0];
717
718
// Allow # preprocessor comments also be counted as comments for "##" cases
719
if (CommentString[1] == '#')
720
return CommentString[0] == Ptr[0];
721
722
return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723
}
724
725
bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726
return strncmp(Ptr, MAI.getSeparatorString(),
727
strlen(MAI.getSeparatorString())) == 0;
728
}
729
730
/// Lex and return the next token, updating the start-of-line and
/// start-of-statement flags as it goes.
AsmToken AsmLexer::LexToken() {
  TokStart = CurPtr;
  // Consumes one character unless the buffer is exhausted (EOF).
  int CurChar = getNextChar();

  if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
    // If this starts with a '#', this may be a cpp
    // hash directive and otherwise a line comment.
    AsmToken Peeked[2];
    MutableArrayRef<AsmToken> PeekBuf(Peeked, 2);
    const size_t NumPeeked = peekTokens(PeekBuf, true);
    // There cannot be a space preceding this
    const bool LooksLikeHashDirective =
        IsAtStartOfLine && NumPeeked == 2 && Peeked[0].is(AsmToken::Integer) &&
        Peeked[1].is(AsmToken::String);
    if (LooksLikeHashDirective) {
      CurPtr = TokStart; // reset curPtr;
      StringRef Line = LexUntilEndOfLine();
      UnLex(Peeked[1]);
      UnLex(Peeked[0]);
      return AsmToken(AsmToken::HashDirective, Line);
    }

    if (MAI.shouldAllowAdditionalComments())
      return LexLineComment();
  }

  if (isAtStartOfComment(TokStart))
    return LexLineComment();

  if (isAtStatementSeparator(TokStart)) {
    // One separator character was already consumed above.
    const size_t SeparatorLen = strlen(MAI.getSeparatorString());
    CurPtr += SeparatorLen - 1;
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, SeparatorLen));
  }

  // If we're missing a newline at EOF, make sure we still get an
  // EndOfStatement token before the Eof token.
  if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
  }
  IsAtStartOfLine = false;
  const bool WasAtStartOfStatement = IsAtStartOfStatement;
  IsAtStartOfStatement = false;

  // Token covering just the character that was consumed above.
  const auto OneChar = [this](AsmToken::TokenKind Kind) {
    return AsmToken(Kind, StringRef(TokStart, 1));
  };
  // Consume one more character and produce a two-character token.
  const auto TwoChar = [this](AsmToken::TokenKind Kind) {
    ++CurPtr;
    return AsmToken(Kind, StringRef(TokStart, 2));
  };

  switch (CurChar) {
  default:
    // Handle identifier: [a-zA-Z_.$@#?][a-zA-Z0-9_.$@#?]*
    // Whether or not the lexer accepts '$', '@', '#' and '?' at the start of
    // an identifier is target-dependent. These characters are handled in the
    // respective switch cases.
    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "invalid character in input");
  case EOF:
    if (EndStatementAtEOF) {
      IsAtStartOfLine = true;
      IsAtStartOfStatement = true;
    }
    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
  case 0:
  case ' ':
  case '\t':
    // Whitespace does not change whether we are at the start of a statement.
    IsAtStartOfStatement = WasAtStartOfStatement;
    while (*CurPtr == ' ' || *CurPtr == '\t')
      CurPtr++;
    if (SkipSpace)
      return LexToken(); // Ignore whitespace.
    return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
  case '\r': {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    // If this is a CR followed by LF, treat that as one token.
    if (CurPtr != CurBuf.end() && *CurPtr == '\n')
      ++CurPtr;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, CurPtr - TokStart));
  }
  case '\n':
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return OneChar(AsmToken::EndOfStatement);
  case ':': return OneChar(AsmToken::Colon);
  case '+': return OneChar(AsmToken::Plus);
  case '~': return OneChar(AsmToken::Tilde);
  case '(': return OneChar(AsmToken::LParen);
  case ')': return OneChar(AsmToken::RParen);
  case '[': return OneChar(AsmToken::LBrac);
  case ']': return OneChar(AsmToken::RBrac);
  case '{': return OneChar(AsmToken::LCurly);
  case '}': return OneChar(AsmToken::RCurly);
  case '*': return OneChar(AsmToken::Star);
  case ',': return OneChar(AsmToken::Comma);
  case '$': {
    if (LexMotorolaIntegers && isHexDigit(*CurPtr))
      return LexDigit();
    if (MAI.doesAllowDollarAtStartOfIdentifier())
      return LexIdentifier();
    return OneChar(AsmToken::Dollar);
  }
  case '@':
    if (MAI.doesAllowAtAtStartOfIdentifier())
      return LexIdentifier();
    return OneChar(AsmToken::At);
  case '#':
    if (MAI.doesAllowHashAtStartOfIdentifier())
      return LexIdentifier();
    return OneChar(AsmToken::Hash);
  case '?':
    if (MAI.doesAllowQuestionAtStartOfIdentifier())
      return LexIdentifier();
    return OneChar(AsmToken::Question);
  case '\\': return OneChar(AsmToken::BackSlash);
  case '=':
    if (*CurPtr == '=')
      return TwoChar(AsmToken::EqualEqual);
    return OneChar(AsmToken::Equal);
  case '-':
    if (*CurPtr == '>')
      return TwoChar(AsmToken::MinusGreater);
    return OneChar(AsmToken::Minus);
  case '|':
    if (*CurPtr == '|')
      return TwoChar(AsmToken::PipePipe);
    return OneChar(AsmToken::Pipe);
  case '^': return OneChar(AsmToken::Caret);
  case '&':
    if (*CurPtr == '&')
      return TwoChar(AsmToken::AmpAmp);
    return OneChar(AsmToken::Amp);
  case '!':
    if (*CurPtr == '=')
      return TwoChar(AsmToken::ExclaimEqual);
    return OneChar(AsmToken::Exclaim);
  case '%':
    if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1'))
      return LexDigit();

    if (MAI.hasMipsExpressions()) {
      // Each length counts the leading '%' plus the operator name. Longer
      // names are listed before the names they start with.
      const auto [Operator, OperatorLength] =
          StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
              StringRef(CurPtr))
              .StartsWith("call16", {AsmToken::PercentCall16, 7})
              .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
              .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
              .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
              .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
              .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
              .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
              .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
              .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
              .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
              .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
              .StartsWith("got", {AsmToken::PercentGot, 4})
              .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
              .StartsWith("higher", {AsmToken::PercentHigher, 7})
              .StartsWith("highest", {AsmToken::PercentHighest, 8})
              .StartsWith("hi", {AsmToken::PercentHi, 3})
              .StartsWith("lo", {AsmToken::PercentLo, 3})
              .StartsWith("neg", {AsmToken::PercentNeg, 4})
              .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
              .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
              .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
              .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
              .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
              .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
              .Default({AsmToken::Percent, 1});

      if (Operator != AsmToken::Percent) {
        // The '%' itself has already been consumed.
        CurPtr += OperatorLength - 1;
        return AsmToken(Operator, StringRef(TokStart, OperatorLength));
      }
    }
    return OneChar(AsmToken::Percent);
  case '/':
    // LexSlash() decides how the start-of-statement flag changes.
    IsAtStartOfStatement = WasAtStartOfStatement;
    return LexSlash();
  case '\'': return LexSingleQuote();
  case '"': return LexQuote();
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return LexDigit();
  case '<':
    if (*CurPtr == '<')
      return TwoChar(AsmToken::LessLess);
    if (*CurPtr == '=')
      return TwoChar(AsmToken::LessEqual);
    if (*CurPtr == '>')
      return TwoChar(AsmToken::LessGreater);
    return OneChar(AsmToken::Less);
  case '>':
    if (*CurPtr == '>')
      return TwoChar(AsmToken::GreaterGreater);
    if (*CurPtr == '=')
      return TwoChar(AsmToken::GreaterEqual);
    return OneChar(AsmToken::Greater);

  // TODO: Quoted identifiers (objc methods etc)
  // local labels: [0-9][:]
  // Forward/backward labels: [0-9][fb]
  // Integers, fp constants, character constants.
  }
}
962
963