Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
freebsd
GitHub Repository: freebsd/freebsd-src
Path: blob/main/contrib/llvm-project/clang/lib/Lex/LiteralSupport.cpp
35233 views
1
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the NumericLiteralParser, CharLiteralParser, and
10
// StringLiteralParser interfaces.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "clang/Lex/LiteralSupport.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TargetInfo.h"
19
#include "clang/Lex/LexDiagnostic.h"
20
#include "clang/Lex/Lexer.h"
21
#include "clang/Lex/Preprocessor.h"
22
#include "clang/Lex/Token.h"
23
#include "llvm/ADT/APInt.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/StringExtras.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include "llvm/Support/ConvertUTF.h"
28
#include "llvm/Support/Error.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include "llvm/Support/Unicode.h"
31
#include <algorithm>
32
#include <cassert>
33
#include <cstddef>
34
#include <cstdint>
35
#include <cstring>
36
#include <string>
37
38
using namespace clang;
39
40
/// Return the width \p Target reports for the code unit type that backs a
/// character or string literal token of kind \p kind.
///
/// Ordinary and UTF-8 literals use the target's char width, wide literals the
/// wchar_t width, and UTF-16 / UTF-32 literals the char16_t / char32_t widths.
/// Any other token kind is a programming error.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  unsigned Width = 0;
  switch (kind) {
  case tok::char_constant:
  case tok::string_literal:
  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
    Width = Target.getCharWidth();
    break;
  case tok::wide_char_constant:
  case tok::wide_string_literal:
    Width = Target.getWCharWidth();
    break;
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
    Width = Target.getChar16Width();
    break;
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    Width = Target.getChar32Width();
    break;
  default:
    llvm_unreachable("Unknown token type!");
  }
  return Width;
}
59
60
/// Return the length of the encoding prefix for a character or string literal
/// token of kind \p kind: 0 for ordinary literals, 2 for UTF-8 literals, and 1
/// for wide, UTF-16 and UTF-32 literals.
///
/// Any other token kind is a programming error.
static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
  unsigned PrefixLen = 0;
  switch (kind) {
  case tok::char_constant:
  case tok::string_literal:
    PrefixLen = 0;
    break;
  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
    PrefixLen = 2;
    break;
  case tok::wide_char_constant:
  case tok::wide_string_literal:
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    PrefixLen = 1;
    break;
  default:
    llvm_unreachable("Unknown token type!");
  }
  return PrefixLen;
}
79
80
/// Build the character range covering [\p TokRangeBegin, \p TokRangeEnd) of
/// the token located at \p TokLoc.
///
/// Both pointers are positions inside the spelling buffer that starts at
/// \p TokBegin; they are converted to source locations by advancing from
/// \p TokLoc by the corresponding character offsets.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SM = TokLoc.getManager();

  // Offset of the range start from the token start, then the range length.
  const auto OffsetInToken = TokRangeBegin - TokBegin;
  const auto RangeLength = TokRangeEnd - TokRangeBegin;

  SourceLocation RangeStart =
      Lexer::AdvanceToTokenCharacter(TokLoc, OffsetInToken, SM, Features);
  SourceLocation RangeStop =
      Lexer::AdvanceToTokenCharacter(RangeStart, RangeLength, SM, Features);
  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
93
94
/// Produce a diagnostic highlighting some portion of a literal.
95
///
96
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
97
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
99
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100
const LangOptions &Features, FullSourceLoc TokLoc,
101
const char *TokBegin, const char *TokRangeBegin,
102
const char *TokRangeEnd, unsigned DiagID) {
103
SourceLocation Begin =
104
Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
105
TokLoc.getManager(), Features);
106
return Diags->Report(Begin, DiagID) <<
107
MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108
}
109
110
/// Return true if \p Escape, the character that follows the backslash of an
/// escape sequence, names one of the simple escapes accepted in an unevaluated
/// string literal: \' \" \? \\ \a \b \f \n \r \t \v.
///
/// Every other character (including the introducers of numeric escapes such
/// as 'x', 'o' and the octal digits) yields false.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  switch (Escape) {
  case '\'':
  case '"':
  case '?':
  case '\\':
  case 'a':
  case 'b':
  case 'f':
  case 'n':
  case 'r':
  case 't':
  case 'v':
    return true;
  default:
    return false;
  }
}
127
128
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129
/// either a character or a string literal.
130
static unsigned ProcessCharEscape(const char *ThisTokBegin,
131
const char *&ThisTokBuf,
132
const char *ThisTokEnd, bool &HadError,
133
FullSourceLoc Loc, unsigned CharWidth,
134
DiagnosticsEngine *Diags,
135
const LangOptions &Features,
136
StringLiteralEvalMethod EvalMethod) {
137
const char *EscapeBegin = ThisTokBuf;
138
bool Delimited = false;
139
bool EndDelimiterFound = false;
140
141
// Skip the '\' char.
142
++ThisTokBuf;
143
144
// We know that this character can't be off the end of the buffer, because
145
// that would have been \", which would not have been the end of string.
146
unsigned ResultChar = *ThisTokBuf++;
147
char Escape = ResultChar;
148
switch (ResultChar) {
149
// These map to themselves.
150
case '\\': case '\'': case '"': case '?': break;
151
152
// These have fixed mappings.
153
case 'a':
154
// TODO: K&R: the meaning of '\\a' is different in traditional C
155
ResultChar = 7;
156
break;
157
case 'b':
158
ResultChar = 8;
159
break;
160
case 'e':
161
if (Diags)
162
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
163
diag::ext_nonstandard_escape) << "e";
164
ResultChar = 27;
165
break;
166
case 'E':
167
if (Diags)
168
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
169
diag::ext_nonstandard_escape) << "E";
170
ResultChar = 27;
171
break;
172
case 'f':
173
ResultChar = 12;
174
break;
175
case 'n':
176
ResultChar = 10;
177
break;
178
case 'r':
179
ResultChar = 13;
180
break;
181
case 't':
182
ResultChar = 9;
183
break;
184
case 'v':
185
ResultChar = 11;
186
break;
187
case 'x': { // Hex escape.
188
ResultChar = 0;
189
if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190
Delimited = true;
191
ThisTokBuf++;
192
if (*ThisTokBuf == '}') {
193
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194
diag::err_delimited_escape_empty);
195
return ResultChar;
196
}
197
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
198
if (Diags)
199
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200
diag::err_hex_escape_no_digits) << "x";
201
return ResultChar;
202
}
203
204
// Hex escapes are a maximal series of hex digits.
205
bool Overflow = false;
206
for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207
if (Delimited && *ThisTokBuf == '}') {
208
ThisTokBuf++;
209
EndDelimiterFound = true;
210
break;
211
}
212
int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213
if (CharVal == -1) {
214
// Non delimited hex escape sequences stop at the first non-hex digit.
215
if (!Delimited)
216
break;
217
HadError = true;
218
if (Diags)
219
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220
diag::err_delimited_escape_invalid)
221
<< StringRef(ThisTokBuf, 1);
222
continue;
223
}
224
// About to shift out a digit?
225
if (ResultChar & 0xF0000000)
226
Overflow = true;
227
ResultChar <<= 4;
228
ResultChar |= CharVal;
229
}
230
// See if any bits will be truncated when evaluated as a character.
231
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
232
Overflow = true;
233
ResultChar &= ~0U >> (32-CharWidth);
234
}
235
236
// Check for overflow.
237
if (!HadError && Overflow) { // Too many digits to fit in
238
HadError = true;
239
if (Diags)
240
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241
diag::err_escape_too_large)
242
<< 0;
243
}
244
break;
245
}
246
case '0': case '1': case '2': case '3':
247
case '4': case '5': case '6': case '7': {
248
// Octal escapes.
249
--ThisTokBuf;
250
ResultChar = 0;
251
252
// Octal escapes are a series of octal digits with maximum length 3.
253
// "\0123" is a two digit sequence equal to "\012" "3".
254
unsigned NumDigits = 0;
255
do {
256
ResultChar <<= 3;
257
ResultChar |= *ThisTokBuf++ - '0';
258
++NumDigits;
259
} while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
260
ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
261
262
// Check for overflow. Reject '\777', but not L'\777'.
263
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
264
if (Diags)
265
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
266
diag::err_escape_too_large) << 1;
267
ResultChar &= ~0U >> (32-CharWidth);
268
}
269
break;
270
}
271
case 'o': {
272
bool Overflow = false;
273
if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274
HadError = true;
275
if (Diags)
276
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
277
diag::err_delimited_escape_missing_brace)
278
<< "o";
279
280
break;
281
}
282
ResultChar = 0;
283
Delimited = true;
284
++ThisTokBuf;
285
if (*ThisTokBuf == '}') {
286
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287
diag::err_delimited_escape_empty);
288
return ResultChar;
289
}
290
291
while (ThisTokBuf != ThisTokEnd) {
292
if (*ThisTokBuf == '}') {
293
EndDelimiterFound = true;
294
ThisTokBuf++;
295
break;
296
}
297
if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298
HadError = true;
299
if (Diags)
300
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301
diag::err_delimited_escape_invalid)
302
<< StringRef(ThisTokBuf, 1);
303
ThisTokBuf++;
304
continue;
305
}
306
// Check if one of the top three bits is set before shifting them out.
307
if (ResultChar & 0xE0000000)
308
Overflow = true;
309
310
ResultChar <<= 3;
311
ResultChar |= *ThisTokBuf++ - '0';
312
}
313
// Check for overflow. Reject '\777', but not L'\777'.
314
if (!HadError &&
315
(Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316
HadError = true;
317
if (Diags)
318
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319
diag::err_escape_too_large)
320
<< 1;
321
ResultChar &= ~0U >> (32 - CharWidth);
322
}
323
break;
324
}
325
// Otherwise, these are not valid escapes.
326
case '(': case '{': case '[': case '%':
327
// GCC accepts these as extensions. We warn about them as such though.
328
if (Diags)
329
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
330
diag::ext_nonstandard_escape)
331
<< std::string(1, ResultChar);
332
break;
333
default:
334
if (!Diags)
335
break;
336
337
if (isPrintable(ResultChar))
338
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
339
diag::ext_unknown_escape)
340
<< std::string(1, ResultChar);
341
else
342
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
343
diag::ext_unknown_escape)
344
<< "x" + llvm::utohexstr(ResultChar);
345
break;
346
}
347
348
if (Delimited && Diags) {
349
if (!EndDelimiterFound)
350
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351
diag::err_expected)
352
<< tok::r_brace;
353
else if (!HadError) {
354
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
355
Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356
: diag::ext_delimited_escape_sequence)
357
<< /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358
}
359
}
360
361
if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362
!IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
363
Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
364
diag::err_unevaluated_string_invalid_escape_sequence)
365
<< StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
366
HadError = true;
367
}
368
369
return ResultChar;
370
}
371
372
static void appendCodePoint(unsigned Codepoint,
373
llvm::SmallVectorImpl<char> &Str) {
374
char ResultBuf[4];
375
char *ResultPtr = ResultBuf;
376
if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
377
Str.append(ResultBuf, ResultPtr);
378
}
379
380
/// Copy \p Input into \p Buf, replacing every backslash escape with the
/// encoding that appendCodePoint produces for the code point it denotes.
///
/// The escapes handled are \uXXXX, \UXXXXXXXX, \u{X...} and \N{name}. The
/// input is expected to be well formed already: malformed escapes are only
/// caught by assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  // Shift one hexadecimal digit into the code point being accumulated.
  auto ShiftInHexDigit = [](uint32_t &Accum, char Digit) {
    unsigned Value = llvm::hexDigitValue(Digit);
    assert(Value != -1U);
    Accum <<= 4;
    Accum += Value;
  };

  StringRef::iterator Cur = Input.begin();
  const StringRef::iterator End = Input.end();
  while (Cur != End) {
    // Anything that is not an escape is copied through unchanged.
    if (*Cur != '\\') {
      Buf.push_back(*Cur);
      ++Cur;
      continue;
    }

    // Step over the backslash and read the escape kind.
    ++Cur;
    char Kind = *Cur;
    ++Cur;

    assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
    uint32_t CodePoint = 0;

    // Delimited form: \u{X...}.
    if (Kind == 'u' && *Cur == '{') {
      for (++Cur; *Cur != '}'; ++Cur)
        ShiftInHexDigit(CodePoint, *Cur);
      appendCodePoint(CodePoint, Buf);
      ++Cur; // Step over the closing '}'.
      continue;
    }

    // Named form: \N{name}.
    if (Kind == 'N') {
      assert(*Cur == '{');
      ++Cur;
      auto Delim = std::find(Cur, Input.end(), '}');
      assert(Delim != Input.end());
      StringRef Name(Cur, std::distance(Cur, Delim));
      std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
          llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      assert(Res && "could not find a codepoint that was previously found");
      CodePoint = Res->CodePoint;
      assert(CodePoint != 0xFFFFFFFF);
      appendCodePoint(CodePoint, Buf);
      Cur = Delim + 1; // Resume after the closing '}'.
      continue;
    }

    // Fixed-width form: four digits for \u, eight for \U.
    unsigned NumHexDigits = (Kind == 'u') ? 4 : 8;
    assert(Cur + NumHexDigits <= End);
    for (; NumHexDigits != 0; ++Cur, --NumHexDigits)
      ShiftInHexDigit(CodePoint, *Cur);

    appendCodePoint(CodePoint, Buf);
  }
}
441
442
bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
443
const LangOptions &LO) {
444
return LO.MicrosoftExt &&
445
(K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
446
K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
447
K == tok::kw___FUNCDNAME__);
448
}
449
450
bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
451
return tok::isStringLiteral(Tok.getKind()) ||
452
isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
453
}
454
455
static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
456
const char *&ThisTokBuf,
457
const char *ThisTokEnd, uint32_t &UcnVal,
458
unsigned short &UcnLen, bool &Delimited,
459
FullSourceLoc Loc, DiagnosticsEngine *Diags,
460
const LangOptions &Features,
461
bool in_char_string_literal = false) {
462
const char *UcnBegin = ThisTokBuf;
463
bool HasError = false;
464
bool EndDelimiterFound = false;
465
466
// Skip the '\u' char's.
467
ThisTokBuf += 2;
468
Delimited = false;
469
if (UcnBegin[1] == 'u' && in_char_string_literal &&
470
ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
471
Delimited = true;
472
ThisTokBuf++;
473
} else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
474
if (Diags)
475
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
476
diag::err_hex_escape_no_digits)
477
<< StringRef(&ThisTokBuf[-1], 1);
478
return false;
479
}
480
UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
481
482
bool Overflow = false;
483
unsigned short Count = 0;
484
for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
485
++ThisTokBuf) {
486
if (Delimited && *ThisTokBuf == '}') {
487
++ThisTokBuf;
488
EndDelimiterFound = true;
489
break;
490
}
491
int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492
if (CharVal == -1) {
493
HasError = true;
494
if (!Delimited)
495
break;
496
if (Diags) {
497
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498
diag::err_delimited_escape_invalid)
499
<< StringRef(ThisTokBuf, 1);
500
}
501
Count++;
502
continue;
503
}
504
if (UcnVal & 0xF0000000) {
505
Overflow = true;
506
continue;
507
}
508
UcnVal <<= 4;
509
UcnVal |= CharVal;
510
Count++;
511
}
512
513
if (Overflow) {
514
if (Diags)
515
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516
diag::err_escape_too_large)
517
<< 0;
518
return false;
519
}
520
521
if (Delimited && !EndDelimiterFound) {
522
if (Diags) {
523
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524
diag::err_expected)
525
<< tok::r_brace;
526
}
527
return false;
528
}
529
530
// If we didn't consume the proper number of digits, there is a problem.
531
if (Count == 0 || (!Delimited && Count != UcnLen)) {
532
if (Diags)
533
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534
Delimited ? diag::err_delimited_escape_empty
535
: diag::err_ucn_escape_incomplete);
536
return false;
537
}
538
return !HasError;
539
}
540
541
static void DiagnoseInvalidUnicodeCharacterName(
542
DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
543
const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
544
llvm::StringRef Name) {
545
546
Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
547
diag::err_invalid_ucn_name)
548
<< Name;
549
550
namespace u = llvm::sys::unicode;
551
552
std::optional<u::LooseMatchingResult> Res =
553
u::nameToCodepointLooseMatching(Name);
554
if (Res) {
555
Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
556
diag::note_invalid_ucn_name_loose_matching)
557
<< FixItHint::CreateReplacement(
558
MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
559
TokRangeEnd),
560
Res->Name);
561
return;
562
}
563
564
unsigned Distance = 0;
565
SmallVector<u::MatchForCodepointName> Matches =
566
u::nearestMatchesForCodepointName(Name, 5);
567
assert(!Matches.empty() && "No unicode characters found");
568
569
for (const auto &Match : Matches) {
570
if (Distance == 0)
571
Distance = Match.Distance;
572
if (std::max(Distance, Match.Distance) -
573
std::min(Distance, Match.Distance) >
574
3)
575
break;
576
Distance = Match.Distance;
577
578
std::string Str;
579
llvm::UTF32 V = Match.Value;
580
bool Converted =
581
llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582
(void)Converted;
583
assert(Converted && "Found a match wich is not a unicode character");
584
585
Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
586
diag::note_invalid_ucn_name_candidate)
587
<< Match.Name << llvm::utohexstr(Match.Value)
588
<< Str // FIXME: Fix the rendering of non printable characters
589
<< FixItHint::CreateReplacement(
590
MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
591
TokRangeEnd),
592
Match.Name);
593
}
594
}
595
596
static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
597
const char *&ThisTokBuf,
598
const char *ThisTokEnd, uint32_t &UcnVal,
599
unsigned short &UcnLen, FullSourceLoc Loc,
600
DiagnosticsEngine *Diags,
601
const LangOptions &Features) {
602
const char *UcnBegin = ThisTokBuf;
603
assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
604
ThisTokBuf += 2;
605
if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
606
if (Diags) {
607
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
608
diag::err_delimited_escape_missing_brace)
609
<< StringRef(&ThisTokBuf[-1], 1);
610
}
611
return false;
612
}
613
ThisTokBuf++;
614
const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615
return C == '}' || isVerticalWhitespace(C);
616
});
617
bool Incomplete = ClosingBrace == ThisTokEnd;
618
bool Empty = ClosingBrace == ThisTokBuf;
619
if (Incomplete || Empty) {
620
if (Diags) {
621
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
622
Incomplete ? diag::err_ucn_escape_incomplete
623
: diag::err_delimited_escape_empty)
624
<< StringRef(&UcnBegin[1], 1);
625
}
626
ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
627
return false;
628
}
629
StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
630
ThisTokBuf = ClosingBrace + 1;
631
std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
632
if (!Res) {
633
if (Diags)
634
DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
635
&UcnBegin[3], ClosingBrace, Name);
636
return false;
637
}
638
UcnVal = *Res;
639
UcnLen = UcnVal > 0xFFFF ? 8 : 4;
640
return true;
641
}
642
643
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
644
/// return the UTF32.
645
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
646
const char *ThisTokEnd, uint32_t &UcnVal,
647
unsigned short &UcnLen, FullSourceLoc Loc,
648
DiagnosticsEngine *Diags,
649
const LangOptions &Features,
650
bool in_char_string_literal = false) {
651
652
bool HasError;
653
const char *UcnBegin = ThisTokBuf;
654
bool IsDelimitedEscapeSequence = false;
655
bool IsNamedEscapeSequence = false;
656
if (ThisTokBuf[1] == 'N') {
657
IsNamedEscapeSequence = true;
658
HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
659
UcnVal, UcnLen, Loc, Diags, Features);
660
} else {
661
HasError =
662
!ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
663
UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
664
Features, in_char_string_literal);
665
}
666
if (HasError)
667
return false;
668
669
// Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
670
if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
671
UcnVal > 0x10FFFF) { // maximum legal UTF32 value
672
if (Diags)
673
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
674
diag::err_ucn_escape_invalid);
675
return false;
676
}
677
678
// C23 and C++11 allow UCNs that refer to control characters
679
// and basic source characters inside character and string literals
680
if (UcnVal < 0xa0 &&
681
// $, @, ` are allowed in all language modes
682
(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
683
bool IsError =
684
(!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
685
if (Diags) {
686
char BasicSCSChar = UcnVal;
687
if (UcnVal >= 0x20 && UcnVal < 0x7f)
688
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
689
IsError ? diag::err_ucn_escape_basic_scs
690
: Features.CPlusPlus
691
? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
692
: diag::warn_c23_compat_literal_ucn_escape_basic_scs)
693
<< StringRef(&BasicSCSChar, 1);
694
else
695
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
696
IsError ? diag::err_ucn_control_character
697
: Features.CPlusPlus
698
? diag::warn_cxx98_compat_literal_ucn_control_character
699
: diag::warn_c23_compat_literal_ucn_control_character);
700
}
701
if (IsError)
702
return false;
703
}
704
705
if (!Features.CPlusPlus && !Features.C99 && Diags)
706
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
707
diag::warn_ucn_not_valid_in_c89_literal);
708
709
if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
710
Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
711
Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712
: diag::ext_delimited_escape_sequence)
713
<< (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
714
715
return true;
716
}
717
718
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
719
/// which this UCN will occupy.
720
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
721
const char *ThisTokEnd, unsigned CharByteWidth,
722
const LangOptions &Features, bool &HadError) {
723
// UTF-32: 4 bytes per escape.
724
if (CharByteWidth == 4)
725
return 4;
726
727
uint32_t UcnVal = 0;
728
unsigned short UcnLen = 0;
729
FullSourceLoc Loc;
730
731
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
732
UcnLen, Loc, nullptr, Features, true)) {
733
HadError = true;
734
return 0;
735
}
736
737
// UTF-16: 2 bytes for BMP, 4 bytes otherwise.
738
if (CharByteWidth == 2)
739
return UcnVal <= 0xFFFF ? 2 : 4;
740
741
// UTF-8.
742
if (UcnVal < 0x80)
743
return 1;
744
if (UcnVal < 0x800)
745
return 2;
746
if (UcnVal < 0x10000)
747
return 3;
748
return 4;
749
}
750
751
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
752
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
753
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
754
/// we will likely rework our support for UCN's.
755
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
756
const char *ThisTokEnd,
757
char *&ResultBuf, bool &HadError,
758
FullSourceLoc Loc, unsigned CharByteWidth,
759
DiagnosticsEngine *Diags,
760
const LangOptions &Features) {
761
typedef uint32_t UTF32;
762
UTF32 UcnVal = 0;
763
unsigned short UcnLen = 0;
764
if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
765
Loc, Diags, Features, true)) {
766
HadError = true;
767
return;
768
}
769
770
assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
771
"only character widths of 1, 2, or 4 bytes supported");
772
773
(void)UcnLen;
774
assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
775
776
if (CharByteWidth == 4) {
777
// FIXME: Make the type of the result buffer correct instead of
778
// using reinterpret_cast.
779
llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
780
*ResultPtr = UcnVal;
781
ResultBuf += 4;
782
return;
783
}
784
785
if (CharByteWidth == 2) {
786
// FIXME: Make the type of the result buffer correct instead of
787
// using reinterpret_cast.
788
llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
789
790
if (UcnVal <= (UTF32)0xFFFF) {
791
*ResultPtr = UcnVal;
792
ResultBuf += 2;
793
return;
794
}
795
796
// Convert to UTF16.
797
UcnVal -= 0x10000;
798
*ResultPtr = 0xD800 + (UcnVal >> 10);
799
*(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
800
ResultBuf += 4;
801
return;
802
}
803
804
assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
805
806
// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
807
// The conversion below was inspired by:
808
// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
809
// First, we determine how many bytes the result will require.
810
typedef uint8_t UTF8;
811
812
unsigned short bytesToWrite = 0;
813
if (UcnVal < (UTF32)0x80)
814
bytesToWrite = 1;
815
else if (UcnVal < (UTF32)0x800)
816
bytesToWrite = 2;
817
else if (UcnVal < (UTF32)0x10000)
818
bytesToWrite = 3;
819
else
820
bytesToWrite = 4;
821
822
const unsigned byteMask = 0xBF;
823
const unsigned byteMark = 0x80;
824
825
// Once the bits are split out into bytes of UTF8, this is a mask OR-ed
826
// into the first byte, depending on how many bytes follow.
827
static const UTF8 firstByteMark[5] = {
828
0x00, 0x00, 0xC0, 0xE0, 0xF0
829
};
830
// Finally, we write the bytes into ResultBuf.
831
ResultBuf += bytesToWrite;
832
switch (bytesToWrite) { // note: everything falls through.
833
case 4:
834
*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835
[[fallthrough]];
836
case 3:
837
*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838
[[fallthrough]];
839
case 2:
840
*--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841
[[fallthrough]];
842
case 1:
843
*--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
844
}
845
// Update the buffer.
846
ResultBuf += bytesToWrite;
847
}
848
849
/// integer-constant: [C99 6.4.4.1]
850
/// decimal-constant integer-suffix
851
/// octal-constant integer-suffix
852
/// hexadecimal-constant integer-suffix
853
/// binary-literal integer-suffix [GNU, C++1y]
854
/// user-defined-integer-literal: [C++11 lex.ext]
855
/// decimal-literal ud-suffix
856
/// octal-literal ud-suffix
857
/// hexadecimal-literal ud-suffix
858
/// binary-literal ud-suffix [GNU, C++1y]
859
/// decimal-constant:
860
/// nonzero-digit
861
/// decimal-constant digit
862
/// octal-constant:
863
/// 0
864
/// octal-constant octal-digit
865
/// hexadecimal-constant:
866
/// hexadecimal-prefix hexadecimal-digit
867
/// hexadecimal-constant hexadecimal-digit
868
/// hexadecimal-prefix: one of
869
/// 0x 0X
870
/// binary-literal:
871
/// 0b binary-digit
872
/// 0B binary-digit
873
/// binary-literal binary-digit
874
/// integer-suffix:
875
/// unsigned-suffix [long-suffix]
876
/// unsigned-suffix [long-long-suffix]
877
/// long-suffix [unsigned-suffix]
878
/// long-long-suffix [unsigned-suffix]
879
/// nonzero-digit:
880
/// 1 2 3 4 5 6 7 8 9
881
/// octal-digit:
882
/// 0 1 2 3 4 5 6 7
883
/// hexadecimal-digit:
884
/// 0 1 2 3 4 5 6 7 8 9
885
/// a b c d e f
886
/// A B C D E F
887
/// binary-digit:
888
/// 0
889
/// 1
890
/// unsigned-suffix: one of
891
/// u U
892
/// long-suffix: one of
893
/// l L
894
/// long-long-suffix: one of
895
/// ll LL
896
///
897
/// floating-constant: [C99 6.4.4.2]
898
/// TODO: add rules...
899
///
900
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
901
SourceLocation TokLoc,
902
const SourceManager &SM,
903
const LangOptions &LangOpts,
904
const TargetInfo &Target,
905
DiagnosticsEngine &Diags)
906
: SM(SM), LangOpts(LangOpts), Diags(Diags),
907
ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
908
909
s = DigitsBegin = ThisTokBegin;
910
saw_exponent = false;
911
saw_period = false;
912
saw_ud_suffix = false;
913
saw_fixed_point_suffix = false;
914
isLong = false;
915
isUnsigned = false;
916
isLongLong = false;
917
isSizeT = false;
918
isHalf = false;
919
isFloat = false;
920
isImaginary = false;
921
isFloat16 = false;
922
isFloat128 = false;
923
MicrosoftInteger = 0;
924
isFract = false;
925
isAccum = false;
926
hadError = false;
927
isBitInt = false;
928
929
// This routine assumes that the range begin/end matches the regex for integer
930
// and FP constants (specifically, the 'pp-number' regex), and assumes that
931
// the byte at "*end" is both valid and not part of the regex. Because of
932
// this, it doesn't have to check for 'overscan' in various places.
933
// Note: For HLSL, the end token is allowed to be '.' which would be in the
934
// 'pp-number' regex. This is required to support vector swizzles on numeric
935
// constants (i.e. 1.xx or 1.5f.rrr).
936
if (isPreprocessingNumberBody(*ThisTokEnd) &&
937
!(LangOpts.HLSL && *ThisTokEnd == '.')) {
938
Diags.Report(TokLoc, diag::err_lexing_numeric);
939
hadError = true;
940
return;
941
}
942
943
if (*s == '0') { // parse radix
944
ParseNumberStartingWithZero(TokLoc);
945
if (hadError)
946
return;
947
} else { // the first digit is non-zero
948
radix = 10;
949
s = SkipDigits(s);
950
if (s == ThisTokEnd) {
951
// Done.
952
} else {
953
ParseDecimalOrOctalCommon(TokLoc);
954
if (hadError)
955
return;
956
}
957
}
958
959
SuffixBegin = s;
960
checkSeparator(TokLoc, s, CSK_AfterDigits);
961
962
// Initial scan to lookahead for fixed point suffix.
963
if (LangOpts.FixedPoint) {
964
for (const char *c = s; c != ThisTokEnd; ++c) {
965
if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
966
saw_fixed_point_suffix = true;
967
break;
968
}
969
}
970
}
971
972
// Parse the suffix. At this point we can classify whether we have an FP or
973
// integer constant.
974
bool isFixedPointConstant = isFixedPointLiteral();
975
bool isFPConstant = isFloatingLiteral();
976
bool HasSize = false;
977
bool DoubleUnderscore = false;
978
979
// Loop over all of the characters of the suffix. If we see something bad,
980
// we break out of the loop.
981
for (; s != ThisTokEnd; ++s) {
982
switch (*s) {
983
case 'R':
984
case 'r':
985
if (!LangOpts.FixedPoint)
986
break;
987
if (isFract || isAccum) break;
988
if (!(saw_period || saw_exponent)) break;
989
isFract = true;
990
continue;
991
case 'K':
992
case 'k':
993
if (!LangOpts.FixedPoint)
994
break;
995
if (isFract || isAccum) break;
996
if (!(saw_period || saw_exponent)) break;
997
isAccum = true;
998
continue;
999
case 'h': // FP Suffix for "half".
1000
case 'H':
1001
// OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
1002
if (!(LangOpts.Half || LangOpts.FixedPoint))
1003
break;
1004
if (isIntegerLiteral()) break; // Error for integer constant.
1005
if (HasSize)
1006
break;
1007
HasSize = true;
1008
isHalf = true;
1009
continue; // Success.
1010
case 'f': // FP Suffix for "float"
1011
case 'F':
1012
if (!isFPConstant) break; // Error for integer constant.
1013
if (HasSize)
1014
break;
1015
HasSize = true;
1016
1017
// CUDA host and device may have different _Float16 support, therefore
1018
// allows f16 literals to avoid false alarm.
1019
// When we compile for OpenMP target offloading on NVPTX, f16 suffix
1020
// should also be supported.
1021
// ToDo: more precise check for CUDA.
1022
// TODO: AMDGPU might also support it in the future.
1023
if ((Target.hasFloat16Type() || LangOpts.CUDA ||
1024
(LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1025
s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
1026
s += 2; // success, eat up 2 characters.
1027
isFloat16 = true;
1028
continue;
1029
}
1030
1031
isFloat = true;
1032
continue; // Success.
1033
case 'q': // FP Suffix for "__float128"
1034
case 'Q':
1035
if (!isFPConstant) break; // Error for integer constant.
1036
if (HasSize)
1037
break;
1038
HasSize = true;
1039
isFloat128 = true;
1040
continue; // Success.
1041
case 'u':
1042
case 'U':
1043
if (isFPConstant) break; // Error for floating constant.
1044
if (isUnsigned) break; // Cannot be repeated.
1045
isUnsigned = true;
1046
continue; // Success.
1047
case 'l':
1048
case 'L':
1049
if (HasSize)
1050
break;
1051
HasSize = true;
1052
1053
// Check for long long. The L's need to be adjacent and the same case.
1054
if (s[1] == s[0]) {
1055
assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1056
if (isFPConstant) break; // long long invalid for floats.
1057
isLongLong = true;
1058
++s; // Eat both of them.
1059
} else {
1060
isLong = true;
1061
}
1062
continue; // Success.
1063
case 'z':
1064
case 'Z':
1065
if (isFPConstant)
1066
break; // Invalid for floats.
1067
if (HasSize)
1068
break;
1069
HasSize = true;
1070
isSizeT = true;
1071
continue;
1072
case 'i':
1073
case 'I':
1074
if (LangOpts.MicrosoftExt && !isFPConstant) {
1075
// Allow i8, i16, i32, and i64. First, look ahead and check if
1076
// suffixes are Microsoft integers and not the imaginary unit.
1077
uint8_t Bits = 0;
1078
size_t ToSkip = 0;
1079
switch (s[1]) {
1080
case '8': // i8 suffix
1081
Bits = 8;
1082
ToSkip = 2;
1083
break;
1084
case '1':
1085
if (s[2] == '6') { // i16 suffix
1086
Bits = 16;
1087
ToSkip = 3;
1088
}
1089
break;
1090
case '3':
1091
if (s[2] == '2') { // i32 suffix
1092
Bits = 32;
1093
ToSkip = 3;
1094
}
1095
break;
1096
case '6':
1097
if (s[2] == '4') { // i64 suffix
1098
Bits = 64;
1099
ToSkip = 3;
1100
}
1101
break;
1102
default:
1103
break;
1104
}
1105
if (Bits) {
1106
if (HasSize)
1107
break;
1108
HasSize = true;
1109
MicrosoftInteger = Bits;
1110
s += ToSkip;
1111
assert(s <= ThisTokEnd && "didn't maximally munch?");
1112
break;
1113
}
1114
}
1115
[[fallthrough]];
1116
case 'j':
1117
case 'J':
1118
if (isImaginary) break; // Cannot be repeated.
1119
isImaginary = true;
1120
continue; // Success.
1121
case '_':
1122
if (isFPConstant)
1123
break; // Invalid for floats
1124
if (HasSize)
1125
break;
1126
// There is currently no way to reach this with DoubleUnderscore set.
1127
// If new double underscore literals are added, handle them here as above.
1128
assert(!DoubleUnderscore && "unhandled double underscore case");
1129
if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd &&
1130
s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists
1131
// after __
1132
DoubleUnderscore = true;
1133
s += 2; // Skip both '_'
1134
if (s + 1 < ThisTokEnd &&
1135
(*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U'
1136
isUnsigned = true;
1137
++s;
1138
}
1139
if (s + 1 < ThisTokEnd &&
1140
((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) {
1141
isBitInt = true;
1142
HasSize = true;
1143
continue;
1144
}
1145
}
1146
break;
1147
case 'w':
1148
case 'W':
1149
if (isFPConstant)
1150
break; // Invalid for floats.
1151
if (HasSize)
1152
break; // Invalid if we already have a size for the literal.
1153
1154
// wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1155
// explicitly do not support the suffix in C++ as an extension because a
1156
// library-based UDL that resolves to a library type may be more
1157
// appropriate there. The same rules apply for __wb/__WB.
1158
if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd &&
1159
((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) {
1160
isBitInt = true;
1161
HasSize = true;
1162
++s; // Skip both characters (2nd char skipped on continue).
1163
continue; // Success.
1164
}
1165
}
1166
// If we reached here, there was an error or a ud-suffix.
1167
break;
1168
}
1169
1170
// "i", "if", and "il" are user-defined suffixes in C++1y.
1171
if (s != ThisTokEnd || isImaginary) {
1172
// FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1173
expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1174
if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1175
if (!isImaginary) {
1176
// Any suffix pieces we might have parsed are actually part of the
1177
// ud-suffix.
1178
isLong = false;
1179
isUnsigned = false;
1180
isLongLong = false;
1181
isSizeT = false;
1182
isFloat = false;
1183
isFloat16 = false;
1184
isHalf = false;
1185
isImaginary = false;
1186
isBitInt = false;
1187
MicrosoftInteger = 0;
1188
saw_fixed_point_suffix = false;
1189
isFract = false;
1190
isAccum = false;
1191
}
1192
1193
saw_ud_suffix = true;
1194
return;
1195
}
1196
1197
if (s != ThisTokEnd) {
1198
// Report an error if there are any.
1199
Diags.Report(Lexer::AdvanceToTokenCharacter(
1200
TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1201
diag::err_invalid_suffix_constant)
1202
<< StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1203
<< (isFixedPointConstant ? 2 : isFPConstant);
1204
hadError = true;
1205
}
1206
}
1207
1208
if (!hadError && saw_fixed_point_suffix) {
1209
assert(isFract || isAccum);
1210
}
1211
}
1212
1213
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1214
/// numbers. It issues an error for illegal digits, and handles floating point
1215
/// parsing. If it detects a floating point number, the radix is set to 10.
1216
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1217
assert((radix == 8 || radix == 10) && "Unexpected radix");
1218
1219
// If we have a hex digit other than 'e' (which denotes a FP exponent) then
1220
// the code is using an incorrect base.
1221
if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1222
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1223
Diags.Report(
1224
Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1225
diag::err_invalid_digit)
1226
<< StringRef(s, 1) << (radix == 8 ? 1 : 0);
1227
hadError = true;
1228
return;
1229
}
1230
1231
if (*s == '.') {
1232
checkSeparator(TokLoc, s, CSK_AfterDigits);
1233
s++;
1234
radix = 10;
1235
saw_period = true;
1236
checkSeparator(TokLoc, s, CSK_BeforeDigits);
1237
s = SkipDigits(s); // Skip suffix.
1238
}
1239
if (*s == 'e' || *s == 'E') { // exponent
1240
checkSeparator(TokLoc, s, CSK_AfterDigits);
1241
const char *Exponent = s;
1242
s++;
1243
radix = 10;
1244
saw_exponent = true;
1245
if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
1246
const char *first_non_digit = SkipDigits(s);
1247
if (containsDigits(s, first_non_digit)) {
1248
checkSeparator(TokLoc, s, CSK_BeforeDigits);
1249
s = first_non_digit;
1250
} else {
1251
if (!hadError) {
1252
Diags.Report(Lexer::AdvanceToTokenCharacter(
1253
TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1254
diag::err_exponent_has_no_digits);
1255
hadError = true;
1256
}
1257
return;
1258
}
1259
}
1260
}
1261
1262
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1263
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1264
/// treat it as an invalid suffix.
1265
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1266
StringRef Suffix) {
1267
if (!LangOpts.CPlusPlus11 || Suffix.empty())
1268
return false;
1269
1270
// By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1271
// Suffixes starting with '__' (double underscore) are for use by
1272
// the implementation.
1273
if (Suffix.starts_with("_") && !Suffix.starts_with("__"))
1274
return true;
1275
1276
// In C++11, there are no library suffixes.
1277
if (!LangOpts.CPlusPlus14)
1278
return false;
1279
1280
// In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1281
// Per tweaked N3660, "il", "i", and "if" are also used in the library.
1282
// In C++2a "d" and "y" are used in the library.
1283
return llvm::StringSwitch<bool>(Suffix)
1284
.Cases("h", "min", "s", true)
1285
.Cases("ms", "us", "ns", true)
1286
.Cases("il", "i", "if", true)
1287
.Cases("d", "y", LangOpts.CPlusPlus20)
1288
.Default(false);
1289
}
1290
1291
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1292
const char *Pos,
1293
CheckSeparatorKind IsAfterDigits) {
1294
if (IsAfterDigits == CSK_AfterDigits) {
1295
if (Pos == ThisTokBegin)
1296
return;
1297
--Pos;
1298
} else if (Pos == ThisTokEnd)
1299
return;
1300
1301
if (isDigitSeparator(*Pos)) {
1302
Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1303
LangOpts),
1304
diag::err_digit_separator_not_between_digits)
1305
<< IsAfterDigits;
1306
hadError = true;
1307
}
1308
}
1309
1310
/// ParseNumberStartingWithZero - This method is called when the first character
1311
/// of the number is found to be a zero. This means it is either an octal
1312
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1313
/// a floating point number (01239.123e4). Eat the prefix, determining the
1314
/// radix etc.
1315
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1316
assert(s[0] == '0' && "Invalid method call");
1317
s++;
1318
1319
int c1 = s[0];
1320
1321
// Handle a hex number like 0x1234.
1322
if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1323
s++;
1324
assert(s < ThisTokEnd && "didn't maximally munch?");
1325
radix = 16;
1326
DigitsBegin = s;
1327
s = SkipHexDigits(s);
1328
bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1329
if (s == ThisTokEnd) {
1330
// Done.
1331
} else if (*s == '.') {
1332
s++;
1333
saw_period = true;
1334
const char *floatDigitsBegin = s;
1335
s = SkipHexDigits(s);
1336
if (containsDigits(floatDigitsBegin, s))
1337
HasSignificandDigits = true;
1338
if (HasSignificandDigits)
1339
checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1340
}
1341
1342
if (!HasSignificandDigits) {
1343
Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1344
LangOpts),
1345
diag::err_hex_constant_requires)
1346
<< LangOpts.CPlusPlus << 1;
1347
hadError = true;
1348
return;
1349
}
1350
1351
// A binary exponent can appear with or with a '.'. If dotted, the
1352
// binary exponent is required.
1353
if (*s == 'p' || *s == 'P') {
1354
checkSeparator(TokLoc, s, CSK_AfterDigits);
1355
const char *Exponent = s;
1356
s++;
1357
saw_exponent = true;
1358
if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
1359
const char *first_non_digit = SkipDigits(s);
1360
if (!containsDigits(s, first_non_digit)) {
1361
if (!hadError) {
1362
Diags.Report(Lexer::AdvanceToTokenCharacter(
1363
TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1364
diag::err_exponent_has_no_digits);
1365
hadError = true;
1366
}
1367
return;
1368
}
1369
checkSeparator(TokLoc, s, CSK_BeforeDigits);
1370
s = first_non_digit;
1371
1372
if (!LangOpts.HexFloats)
1373
Diags.Report(TokLoc, LangOpts.CPlusPlus
1374
? diag::ext_hex_literal_invalid
1375
: diag::ext_hex_constant_invalid);
1376
else if (LangOpts.CPlusPlus17)
1377
Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1378
} else if (saw_period) {
1379
Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1380
LangOpts),
1381
diag::err_hex_constant_requires)
1382
<< LangOpts.CPlusPlus << 0;
1383
hadError = true;
1384
}
1385
return;
1386
}
1387
1388
// Handle simple binary numbers 0b01010
1389
if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1390
// 0b101010 is a C++14 and C23 extension.
1391
unsigned DiagId;
1392
if (LangOpts.CPlusPlus14)
1393
DiagId = diag::warn_cxx11_compat_binary_literal;
1394
else if (LangOpts.C23)
1395
DiagId = diag::warn_c23_compat_binary_literal;
1396
else if (LangOpts.CPlusPlus)
1397
DiagId = diag::ext_binary_literal_cxx14;
1398
else
1399
DiagId = diag::ext_binary_literal;
1400
Diags.Report(TokLoc, DiagId);
1401
++s;
1402
assert(s < ThisTokEnd && "didn't maximally munch?");
1403
radix = 2;
1404
DigitsBegin = s;
1405
s = SkipBinaryDigits(s);
1406
if (s == ThisTokEnd) {
1407
// Done.
1408
} else if (isHexDigit(*s) &&
1409
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1410
Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1411
LangOpts),
1412
diag::err_invalid_digit)
1413
<< StringRef(s, 1) << 2;
1414
hadError = true;
1415
}
1416
// Other suffixes will be diagnosed by the caller.
1417
return;
1418
}
1419
1420
// For now, the radix is set to 8. If we discover that we have a
1421
// floating point constant, the radix will change to 10. Octal floating
1422
// point constants are not permitted (only decimal and hexadecimal).
1423
radix = 8;
1424
const char *PossibleNewDigitStart = s;
1425
s = SkipOctalDigits(s);
1426
// When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1427
// as the start of the digits. So if skipping octal digits does not skip
1428
// anything, we leave the digit start where it was.
1429
if (s != PossibleNewDigitStart)
1430
DigitsBegin = PossibleNewDigitStart;
1431
1432
if (s == ThisTokEnd)
1433
return; // Done, simple octal number like 01234
1434
1435
// If we have some other non-octal digit that *is* a decimal digit, see if
1436
// this is part of a floating point number like 094.123 or 09e1.
1437
if (isDigit(*s)) {
1438
const char *EndDecimal = SkipDigits(s);
1439
if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1440
s = EndDecimal;
1441
radix = 10;
1442
}
1443
}
1444
1445
ParseDecimalOrOctalCommon(TokLoc);
1446
}
1447
1448
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1449
switch (Radix) {
1450
case 2:
1451
return NumDigits <= 64;
1452
case 8:
1453
return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1454
case 10:
1455
return NumDigits <= 19; // floor(log10(2^64))
1456
case 16:
1457
return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1458
default:
1459
llvm_unreachable("impossible Radix");
1460
}
1461
}
1462
1463
/// GetIntegerValue - Convert this numeric literal value to an APInt that
1464
/// matches Val's input width. If there is an overflow, set Val to the low bits
1465
/// of the result and return true. Otherwise, return false.
1466
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1467
// Fast path: Compute a conservative bound on the maximum number of
1468
// bits per digit in this radix. If we can't possibly overflow a
1469
// uint64 based on that bound then do the simple conversion to
1470
// integer. This avoids the expensive overflow checking below, and
1471
// handles the common cases that matter (small decimal integers and
1472
// hex/octal values which don't overflow).
1473
const unsigned NumDigits = SuffixBegin - DigitsBegin;
1474
if (alwaysFitsInto64Bits(radix, NumDigits)) {
1475
uint64_t N = 0;
1476
for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1477
if (!isDigitSeparator(*Ptr))
1478
N = N * radix + llvm::hexDigitValue(*Ptr);
1479
1480
// This will truncate the value to Val's input width. Simply check
1481
// for overflow by comparing.
1482
Val = N;
1483
return Val.getZExtValue() != N;
1484
}
1485
1486
Val = 0;
1487
const char *Ptr = DigitsBegin;
1488
1489
llvm::APInt RadixVal(Val.getBitWidth(), radix);
1490
llvm::APInt CharVal(Val.getBitWidth(), 0);
1491
llvm::APInt OldVal = Val;
1492
1493
bool OverflowOccurred = false;
1494
while (Ptr < SuffixBegin) {
1495
if (isDigitSeparator(*Ptr)) {
1496
++Ptr;
1497
continue;
1498
}
1499
1500
unsigned C = llvm::hexDigitValue(*Ptr++);
1501
1502
// If this letter is out of bound for this radix, reject it.
1503
assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1504
1505
CharVal = C;
1506
1507
// Add the digit to the value in the appropriate radix. If adding in digits
1508
// made the value smaller, then this overflowed.
1509
OldVal = Val;
1510
1511
// Multiply by radix, did overflow occur on the multiply?
1512
Val *= RadixVal;
1513
OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1514
1515
// Add value, did overflow occur on the value?
1516
// (a + b) ult b <=> overflow
1517
Val += CharVal;
1518
OverflowOccurred |= Val.ult(CharVal);
1519
}
1520
return OverflowOccurred;
1521
}
1522
1523
/// Convert the literal text (from the start of the token up to the suffix)
/// into \p Result using rounding mode \p RM. Digit separators are removed
/// before conversion. Returns the conversion status, or opInvalidOp if the
/// conversion produced an error.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
                                    llvm::RoundingMode RM) {
  using llvm::APFloat;

  const unsigned Len =
      std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  StringRef Text(ThisTokBegin, Len);

  // Only build a filtered copy when the text contains a digit separator.
  llvm::SmallString<16> Stripped;
  if (Text.contains('\'')) {
    Stripped.reserve(Len);
    for (const char Ch : Text)
      if (!isDigitSeparator(Ch))
        Stripped.push_back(Ch);
    Text = Stripped;
  }

  auto StatusOrErr = Result.convertFromString(Text, RM);
  assert(StatusOrErr && "Invalid floating point representation");
  if (errorToBool(StatusOrErr.takeError()))
    return APFloat::opInvalidOp;
  return *StatusOrErr;
}
1544
1545
/// Return true if \p c is the exponent marker for the given literal kind:
/// 'p'/'P' when \p isHex is set, 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  const char LowerMarker = isHex ? 'p' : 'e';
  const char UpperMarker = isHex ? 'P' : 'E';
  return c == LowerMarker || c == UpperMarker;
}
1550
1551
/// Compute the fixed-point value of this literal, scaled by 2^Scale, and
/// store it in \p StoreVal (truncated or zero-extended to StoreVal's width).
/// Returns true if the value did not fit or the exponent was too large to
/// process; returns false otherwise. Only radix 10 and 16 are supported.
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal,
                                              unsigned Scale) {
  assert(radix == 16 || radix == 10);
  const bool IsHex = radix == 16;

  // Number of characters between the first digit and the suffix, not
  // counting the radix point. Used below to size the working integer.
  unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (saw_period)
    --NumDigits;

  // Scan the exponent first, if there is one.
  bool ExpOverflowOccurred = false;
  uint64_t Exponent = 0;
  int64_t BaseShift = 0;
  const char *SignificandEnd = SuffixBegin;
  if (saw_exponent) {
    const char *Cur = DigitsBegin;
    while (!IsExponentPart(*Cur, IsHex))
      ++Cur;
    SignificandEnd = Cur;

    // Step over the exponent marker and an optional '-'.
    ++Cur;
    const bool NegativeExponent = *Cur == '-';
    if (NegativeExponent)
      ++Cur;

    const unsigned NumExpDigits = SuffixBegin - Cur;
    if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
      const llvm::APInt ExpInt(/*numBits=*/64,
                               llvm::StringRef(Cur, NumExpDigits),
                               /*radix=*/10);
      Exponent = ExpInt.getZExtValue();
    } else {
      ExpOverflowOccurred = true;
    }

    if (NegativeExponent)
      BaseShift -= Exponent;
    else
      BaseShift += Exponent;
  }

  // Number of bits needed for decimal literal is
  //   ceil(NumDigits * log2(10))       Integral part
  // + Scale                            Fractional part
  // + ceil(Exponent * log2(10))        Exponent
  // --------------------------------------------------
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
  //
  // But for simplicity in handling integers, we can round up log2(10) to 4,
  // making:
  //   4 * (NumDigits + Exponent) + Scale
  //
  // Number of bits needed for hexadecimal literal is
  //   4 * NumDigits                    Integral part
  // + Scale                            Fractional part
  // + Exponent                         Exponent
  // --------------------------------------------------
  //   (4 * NumDigits) + Scale + Exponent
  const uint64_t NumBitsNeeded = radix == 10
                                     ? 4 * (NumDigits + Exponent) + Scale
                                     : 4 * NumDigits + Exponent + Scale;

  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
    ExpOverflowOccurred = true;
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);

  // Accumulate all significand digits as one integer, counting how many of
  // them sit after the radix point.
  int64_t NumFractDigits = 0;
  bool PastRadixPoint = false;
  for (const char *Cur = DigitsBegin; Cur < SignificandEnd; ++Cur) {
    if (*Cur == '.') {
      PastRadixPoint = true;
      continue;
    }

    const unsigned Digit = llvm::hexDigitValue(*Cur);
    assert(Digit < radix &&
           "NumericLiteralParser ctor should have rejected this");

    Val *= radix;
    Val += Digit;

    if (PastRadixPoint)
      ++NumFractDigits;
  }

  // Each fractional digit must be undone by one division by the base. For a
  // radix of 16 the base used below is 2, so one hex digit is worth 4 steps.
  const int64_t FractBaseShift = IsHex ? -4 * NumFractDigits : -NumFractDigits;
  BaseShift += FractBaseShift;

  Val <<= Scale;

  // Apply the combined exponent and fraction adjustment.
  const uint64_t Base = IsHex ? 2 : 10;
  if (BaseShift > 0) {
    for (int64_t Step = 0; Step < BaseShift; ++Step)
      Val *= Base;
  } else if (BaseShift < 0) {
    // Stop early once the value reaches zero; further division changes
    // nothing.
    for (int64_t Step = BaseShift; Step < 0 && !Val.isZero(); ++Step)
      Val = Val.udiv(Base);
  }

  // Fit the result into the caller's width, noting whether bits were lost.
  const unsigned ValBits = Val.getBitWidth();
  const unsigned StoreBits = StoreVal.getBitWidth();
  const llvm::APInt MaxVal = llvm::APInt::getMaxValue(StoreBits);
  bool IntOverflowOccurred = false;
  if (ValBits > StoreBits) {
    IntOverflowOccurred = Val.ugt(MaxVal.zext(ValBits));
    StoreVal = Val.trunc(StoreBits);
  } else if (ValBits < StoreBits) {
    IntOverflowOccurred = Val.zext(StoreBits).ugt(MaxVal);
    StoreVal = Val.zext(StoreBits);
  } else {
    StoreVal = Val;
  }

  return IntOverflowOccurred || ExpOverflowOccurred;
}
1667
1668
/// \verbatim
1669
/// user-defined-character-literal: [C++11 lex.ext]
1670
/// character-literal ud-suffix
1671
/// ud-suffix:
1672
/// identifier
1673
/// character-literal: [C++11 lex.ccon]
1674
/// ' c-char-sequence '
1675
/// u' c-char-sequence '
1676
/// U' c-char-sequence '
1677
/// L' c-char-sequence '
1678
/// u8' c-char-sequence ' [C++1z lex.ccon]
1679
/// c-char-sequence:
1680
/// c-char
1681
/// c-char-sequence c-char
1682
/// c-char:
1683
/// any member of the source character set except the single-quote ',
1684
/// backslash \, or new-line character
1685
/// escape-sequence
1686
/// universal-character-name
1687
/// escape-sequence:
1688
/// simple-escape-sequence
1689
/// octal-escape-sequence
1690
/// hexadecimal-escape-sequence
1691
/// simple-escape-sequence:
1692
/// one of \' \" \? \\ \a \b \f \n \r \t \v
1693
/// octal-escape-sequence:
1694
/// \ octal-digit
1695
/// \ octal-digit octal-digit
1696
/// \ octal-digit octal-digit octal-digit
1697
/// hexadecimal-escape-sequence:
1698
/// \x hexadecimal-digit
1699
/// hexadecimal-escape-sequence hexadecimal-digit
1700
/// universal-character-name: [C++11 lex.charset]
1701
/// \u hex-quad
1702
/// \U hex-quad hex-quad
1703
/// hex-quad:
1704
/// hex-digit hex-digit hex-digit hex-digit
1705
/// \endverbatim
1706
///
1707
/// Parse the character literal token spelled in [begin, end): skip the
/// encoding prefix and quotes, split off any ud-suffix, decode the contents
/// to code points, emit diagnostics, and compute the literal's Value.
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  HadError = false;
  Kind = kind;

  const char *const TokBegin = begin;
  const TargetInfo &TI = PP.getTargetInfo();

  // Skip the encoding prefix: one character for any non-ordinary literal,
  // plus a second one for u8.
  if (Kind != tok::char_constant)
    ++begin;
  if (Kind == tok::utf8_char_constant)
    ++begin;

  // The opening quote must come next.
  if (*begin != '\'') {
    PP.Diag(Loc, diag::err_lexing_char);
    HadError = true;
    return;
  }
  ++begin;

  // Split off an optional ud-suffix: everything after the closing quote.
  if (end[-1] != '\'') {
    const char *const SuffixEnd = end;
    while (end[-1] != '\'')
      --end;
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, SuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Drop the closing quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(TI.getCharWidth() == 8 && "Assumes char is 8 bits");
  assert(TI.getIntWidth() <= 64 && (TI.getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(TI.getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // One output slot per source byte is always enough.
  SmallVector<uint32_t, 4> codepoint_buffer;
  codepoint_buffer.resize(end - begin);
  uint32_t *const BufferStart = &codepoint_buffer.front();
  uint32_t *Out = BufferStart;
  uint32_t *const OutEnd = BufferStart + codepoint_buffer.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  uint32_t MaxCodePoint;
  switch (Kind) {
  case tok::wide_char_constant:
    MaxCodePoint = 0xFFFFFFFFu >> (32 - TI.getWCharWidth());
    break;
  case tok::utf8_char_constant:
    MaxCodePoint = 0x7F;
    break;
  case tok::utf16_char_constant:
    MaxCodePoint = 0xFFFF;
    break;
  case tok::utf32_char_constant:
    MaxCodePoint = 0x10FFFF;
    break;
  default:
    MaxCodePoint = 0x7Fu;
    break;
  }

  auto DiagnoseTooLarge = [&] {
    HadError = true;
    PP.Diag(Loc, diag::err_character_too_large);
  };

  while (begin != end) {
    if (begin[0] == '\\') {
      // Universal character name (\u, \U) or named escape (\N).
      if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
        unsigned short UcnLen = 0;
        if (!ProcessUCNEscape(TokBegin, begin, end, *Out, UcnLen,
                              FullSourceLoc(Loc, PP.getSourceManager()),
                              &PP.getDiagnostics(), PP.getLangOpts(), true)) {
          HadError = true;
        } else if (*Out > MaxCodePoint) {
          DiagnoseTooLarge();
        }
        ++Out;
        continue;
      }

      // Any other escape sequence.
      const unsigned CharWidth = getCharWidth(Kind, TI);
      const uint64_t Escaped =
          ProcessCharEscape(TokBegin, begin, end, HadError,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            CharWidth, &PP.getDiagnostics(), PP.getLangOpts(),
                            StringLiteralEvalMethod::Evaluated);
      *Out++ = Escaped;
      continue;
    }

    // A span of non-escape characters: extend it up to the next backslash
    // (or the end) and convert it from UTF-8 in one go.
    const char *SpanCur = begin;
    do {
      ++begin;
    } while (begin != end && *begin != '\\');

    const char *const SpanBegin = SpanCur;
    uint32_t *const OutBefore = Out;
    const llvm::ConversionResult Res = llvm::ConvertUTF8toUTF32(
        reinterpret_cast<llvm::UTF8 const **>(&SpanCur),
        reinterpret_cast<llvm::UTF8 const *>(begin), &Out, OutEnd,
        llvm::strictConversion);

    if (Res == llvm::conversionOK) {
      // Check every code point produced by this span against the limit.
      for (const uint32_t *CP = OutBefore; CP < Out; ++CP)
        if (*CP > MaxCodePoint)
          DiagnoseTooLarge();
      continue;
    }

    // If we see bad encoding for unprefixed character literals, warn and
    // simply copy the byte values, for compatibility with gcc and
    // older versions of clang. For every other kind it is an error.
    if (isOrdinary()) {
      PP.Diag(Loc, diag::warn_bad_character_encoding);
      Out = OutBefore;
      for (const char *Byte = SpanBegin; Byte != begin; ++Byte, ++Out)
        *Out = static_cast<uint8_t>(*Byte);
    } else {
      PP.Diag(Loc, diag::err_bad_character_encoding);
      HadError = true;
    }
  }

  const unsigned NumCharsSoFar = Out - BufferStart;

  // More than one character: a warning for ordinary literals, an error for
  // every other kind.
  if (NumCharsSoFar > 1) {
    if (!isOrdinary()) {
      PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
      HadError = true;
    } else if (NumCharsSoFar == 4) {
      PP.Diag(Loc, diag::warn_four_char_character_literal);
    } else {
      PP.Diag(Loc, diag::warn_multichar_character_literal);
    }
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  llvm::APInt LitVal(TI.getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool MultiCharTooLong = false;
  if (isOrdinary() && isMultiChar()) {
    for (const uint32_t *CP = BufferStart; CP != Out; ++CP) {
      // There must be at least 8 leading zero bits left to shift into.
      if (LitVal.countl_zero() < 8)
        MultiCharTooLong = true;
      LitVal <<= 8;
      LitVal += (*CP & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // Otherwise just take the last character.
    LitVal = Out[-1];
  }

  if (!HadError && MultiCharTooLong)
    PP.Diag(Loc, diag::warn_char_constant_too_large);

  // Transfer the value from APInt to uint64_t
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
  if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = static_cast<signed char>(Value);
}
1895
1896
/// \verbatim
1897
/// string-literal: [C++0x lex.string]
1898
/// encoding-prefix " [s-char-sequence] "
1899
/// encoding-prefix R raw-string
1900
/// encoding-prefix:
1901
/// u8
1902
/// u
1903
/// U
1904
/// L
1905
/// s-char-sequence:
1906
/// s-char
1907
/// s-char-sequence s-char
1908
/// s-char:
1909
/// any member of the source character set except the double-quote ",
1910
/// backslash \, or new-line character
1911
/// escape-sequence
1912
/// universal-character-name
1913
/// raw-string:
1914
/// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1915
/// r-char-sequence:
1916
/// r-char
1917
/// r-char-sequence r-char
1918
/// r-char:
1919
/// any member of the source character set, except a right parenthesis )
1920
/// followed by the initial d-char-sequence (which may be empty)
1921
/// followed by a double quote ".
1922
/// d-char-sequence:
1923
/// d-char
1924
/// d-char-sequence d-char
1925
/// d-char:
1926
/// any member of the basic source character set except:
1927
/// space, the left parenthesis (, the right parenthesis ),
1928
/// the backslash \, and the control characters representing horizontal
1929
/// tab, vertical tab, form feed, and newline.
1930
/// escape-sequence: [C++0x lex.ccon]
1931
/// simple-escape-sequence
1932
/// octal-escape-sequence
1933
/// hexadecimal-escape-sequence
1934
/// simple-escape-sequence:
1935
/// one of \' \" \? \\ \a \b \f \n \r \t \v
1936
/// octal-escape-sequence:
1937
/// \ octal-digit
1938
/// \ octal-digit octal-digit
1939
/// \ octal-digit octal-digit octal-digit
1940
/// hexadecimal-escape-sequence:
1941
/// \x hexadecimal-digit
1942
/// hexadecimal-escape-sequence hexadecimal-digit
1943
/// universal-character-name:
1944
/// \u hex-quad
1945
/// \U hex-quad hex-quad
1946
/// hex-quad:
1947
/// hex-digit hex-digit hex-digit hex-digit
1948
/// \endverbatim
1949
///
1950
/// Construct a parser over the given string literal tokens. The source
/// manager, language options, target info and diagnostics engine are all
/// taken from \p PP; the length/size bookkeeping fields start at zero, Kind
/// starts as tok::unknown, and ResultPtr starts at the beginning of
/// ResultBuf. The actual work is done by init().
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1951
Preprocessor &PP,
1952
StringLiteralEvalMethod EvalMethod)
1953
: SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1954
Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1955
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1956
ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1957
Pascal(false) {
1958
// All members are set up above; hand the tokens to init().
init(StringToks);
1959
}
1960
1961
/// Concatenate and decode the string literal tokens in \p StringToks into
/// ResultBuf.
///
/// The function works in two passes over the tokens:
///  1. A sizing pass that validates token lengths, computes an upper bound on
///     the output size, determines the literal Kind from the encoding
///     prefixes, and diagnoses prefixes/concatenations that are not allowed.
///  2. A decoding pass that fetches each token's spelling, strips the
///     ud-suffix, encoding prefix, quotes and raw-string delimiters, and
///     writes the decoded characters through ResultPtr.
///
/// Afterwards the Pascal length prefix is filled in (when \p was seen in the
/// first token) and over-long literals are diagnosed.
///
/// Failures set hadError; diagnostics are only emitted when Diags is non-null.
///
/// \param StringToks  The adjacent string literal tokens to concatenate.
void StringLiteralParser::init(ArrayRef<Token> StringToks) {
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength() - 2; // -2 for "".
  hadError = false;

  // Determines the kind of string from the prefix
  Kind = tok::string_literal;

  // Adjacent string literal tokens are concatenated (C99 5.1.1.2p1).  The
  // common case is only one string fragment.
  //
  // NOTE: the first token is counted here as well as in the initialization of
  // SizeBound above.  SizeBound is only an upper bound, so the over-estimate
  // is harmless.
  for (const Token &Tok : StringToks) {
    if (Tok.getLength() < 2)
      return DiagnoseLexingError(Tok.getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(Tok.getLength() >= 2 && "literal token is invalid!");
    SizeBound += Tok.getLength() - 2; // -2 for "".

    // Remember maximum string piece length.
    if (Tok.getLength() > MaxTokenLength)
      MaxTokenLength = Tok.getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
      // Unevaluated strings must not carry an encoding prefix: this is an
      // error in C++26 mode and a warning otherwise, with a fix-it that
      // removes the prefix.
      if (Diags) {
        SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
            Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
            Features);
        CharSourceRange Range =
            CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
        StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
                         getEncodingPrefixLen(Tok.getKind()));
        Diags->Report(Tok.getLocation(),
                      Features.CPlusPlus26
                          ? diag::err_unevaluated_string_prefix
                          : diag::warn_unevaluated_string_prefix)
            << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
      }
      if (Features.CPlusPlus26)
        hadError = true;
    } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
      // The first prefixed token fixes the kind of the whole literal; a later
      // token with a different prefix cannot be concatenated with it.
      if (isOrdinary()) {
        Kind = Tok.getKind();
      } else {
        if (Diags)
          Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  // Location of the token that supplied the first ud-suffix; used when
  // reporting a later, mismatching suffix.
  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // A string literal spelling always contains at least its two quotes.  If
    // it does not (e.g. the file changed after a PCH was built), fail
    // gracefully instead of indexing before the start of the spelling below.
    if (ThisTokLen < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      // Walk back to the closing quote, but never past the start of the
      // spelling: a corrupted spelling may not contain a quote at all.
      do {
        --ThisTokEnd;
      } while (ThisTokEnd != ThisTokBegin && ThisTokEnd[-1] != '"');

      // The closing quote cannot be the first character of the token (the
      // opening quote precedes it), so anything shorter is not a literal.
      if (ThisTokEnd - ThisTokBegin < 2)
        return DiagnoseLexingError(StringToks[i].getLocation());

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: remember it and where it came from.
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
        if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            if (UnevaluatedStringHasUDL) {
              Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
                  << SourceRange(TokLoc, TokLoc);
            } else {
              Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
                  << UDSuffixBuf << UDSuffix
                  << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
            }
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      if (ThisTokBuf[1] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ThisTokBuf += 2; // skip R"

      // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
      // characters.
      constexpr unsigned MaxRawStrDelimLen = 16;

      const char *Prefix = ThisTokBuf;
      while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
             ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      if (ThisTokBuf[0] != '(')
        return DiagnoseLexingError(StringToks[i].getLocation());
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      if (ThisTokEnd < ThisTokBuf)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // Check if this is a pascal string
      if (!isUnevaluated() && Features.PascalStrings &&
          ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
          ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Skip only the backslash: the 'p' is copied to the output and its
          // slot is overwritten with the length once decoding is finished.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
            ThisTokBuf[1] == 'N') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
            ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                              FullSourceLoc(StringToks[i].getLocation(), SM),
                              CharByteWidth * 8, Diags, Features, EvalMethod);

        // Store the escape's value as one code unit of the literal's width.
        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  assert((!Pascal || !isUnevaluated()) &&
         "Pascal string in unevaluated context");
  if (Pascal) {
    // Overwrite the first code unit (the placeholder left by \p) with the
    // number of characters that follow it.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}
2282
2283
static const char *resyncUTF8(const char *Err, const char *End) {
2284
if (Err == End)
2285
return End;
2286
End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2287
while (++Err != End && (*Err & 0xC0) == 0x80)
2288
;
2289
return Err;
2290
}
2291
2292
/// This function copies from Fragment, which is a sequence of bytes
2293
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2294
/// Performs widening for multi-byte characters.
2295
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2296
const char *TokBegin,
2297
StringRef Fragment) {
2298
const llvm::UTF8 *ErrorPtrTmp;
2299
if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2300
return false;
2301
2302
// If we see bad encoding for unprefixed string literals, warn and
2303
// simply copy the byte values, for compatibility with gcc and older
2304
// versions of clang.
2305
bool NoErrorOnBadEncoding = isOrdinary();
2306
if (NoErrorOnBadEncoding) {
2307
memcpy(ResultPtr, Fragment.data(), Fragment.size());
2308
ResultPtr += Fragment.size();
2309
}
2310
2311
if (Diags) {
2312
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2313
2314
FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2315
const DiagnosticBuilder &Builder =
2316
Diag(Diags, Features, SourceLoc, TokBegin,
2317
ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2318
NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2319
: diag::err_bad_string_encoding);
2320
2321
const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2322
StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2323
2324
// Decode into a dummy buffer.
2325
SmallString<512> Dummy;
2326
Dummy.reserve(Fragment.size() * CharByteWidth);
2327
char *Ptr = Dummy.data();
2328
2329
while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2330
const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2331
NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2332
Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2333
ErrorPtr, NextStart);
2334
NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2335
}
2336
}
2337
return !NoErrorOnBadEncoding;
2338
}
2339
2340
/// Mark the literal as erroneous and, if a diagnostics engine is available,
/// report a generic string lexing error at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (!Diags)
    return;
  Diags->Report(Loc, diag::err_lexing_string);
}
2345
2346
/// getOffsetOfStringByte - This function returns the offset of the
2347
/// specified byte of the string data represented by Token. This handles
2348
/// advancing over escape sequences in the string.
2349
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2350
unsigned ByteNo) const {
2351
// Get the spelling of the token.
2352
SmallString<32> SpellingBuffer;
2353
SpellingBuffer.resize(Tok.getLength());
2354
2355
bool StringInvalid = false;
2356
const char *SpellingPtr = &SpellingBuffer[0];
2357
unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2358
&StringInvalid);
2359
if (StringInvalid)
2360
return 0;
2361
2362
const char *SpellingStart = SpellingPtr;
2363
const char *SpellingEnd = SpellingPtr+TokLen;
2364
2365
// Handle UTF-8 strings just like narrow strings.
2366
if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2367
SpellingPtr += 2;
2368
2369
assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2370
SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2371
2372
// For raw string literals, this is easy.
2373
if (SpellingPtr[0] == 'R') {
2374
assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2375
// Skip 'R"'.
2376
SpellingPtr += 2;
2377
while (*SpellingPtr != '(') {
2378
++SpellingPtr;
2379
assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2380
}
2381
// Skip '('.
2382
++SpellingPtr;
2383
return SpellingPtr - SpellingStart + ByteNo;
2384
}
2385
2386
// Skip over the leading quote
2387
assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2388
++SpellingPtr;
2389
2390
// Skip over bytes until we find the offset we're looking for.
2391
while (ByteNo) {
2392
assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2393
2394
// Step over non-escapes simply.
2395
if (*SpellingPtr != '\\') {
2396
++SpellingPtr;
2397
--ByteNo;
2398
continue;
2399
}
2400
2401
// Otherwise, this is an escape character. Advance over it.
2402
bool HadError = false;
2403
if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2404
SpellingPtr[1] == 'N') {
2405
const char *EscapePtr = SpellingPtr;
2406
unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2407
1, Features, HadError);
2408
if (Len > ByteNo) {
2409
// ByteNo is somewhere within the escape sequence.
2410
SpellingPtr = EscapePtr;
2411
break;
2412
}
2413
ByteNo -= Len;
2414
} else {
2415
ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2416
FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2417
Diags, Features, StringLiteralEvalMethod::Evaluated);
2418
--ByteNo;
2419
}
2420
assert(!HadError && "This method isn't valid on erroneous strings");
2421
}
2422
2423
return SpellingPtr-SpellingStart;
2424
}
2425
2426
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2427
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2428
/// treat it as an invalid suffix.
2429
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2430
StringRef Suffix) {
2431
return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2432
Suffix == "sv";
2433
}
2434
2435