CoCalc -- gdscript_tokenizer.cpp

GitHub Repository: godotengine/godot
Path: blob/master/modules/gdscript/gdscript_tokenizer.cpp
²⁰⁸⁹⁶ views
1
/**************************************************************************/
2
/*  gdscript_tokenizer.cpp                                                */
3
/**************************************************************************/
4
/*                         This file is part of:                          */
5
/*                             GODOT ENGINE                               */
6
/*                        https://godotengine.org                         */
7
/**************************************************************************/
8
/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9
/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
10
/*                                                                        */
11
/* Permission is hereby granted, free of charge, to any person obtaining  */
12
/* a copy of this software and associated documentation files (the        */
13
/* "Software"), to deal in the Software without restriction, including    */
14
/* without limitation the rights to use, copy, modify, merge, publish,    */
15
/* distribute, sublicense, and/or sell copies of the Software, and to     */
16
/* permit persons to whom the Software is furnished to do so, subject to  */
17
/* the following conditions:                                              */
18
/*                                                                        */
19
/* The above copyright notice and this permission notice shall be         */
20
/* included in all copies or substantial portions of the Software.        */
21
/*                                                                        */
22
/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
23
/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
24
/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25
/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
26
/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
27
/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
28
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
29
/**************************************************************************/
30

31
#include "gdscript_tokenizer.h"
32

33
#include "core/error/error_macros.h"
34
#include "core/string/char_utils.h"
35

36
#ifdef DEBUG_ENABLED
37
#include "servers/text/text_server.h"
38
#endif
39

40
#ifdef TOOLS_ENABLED
41
#include "editor/settings/editor_settings.h"
42
#endif
43

44
// Human-readable name for every Token::Type, indexed by the enum value.
// Entries must appear in exactly the same order as the enum; the trailing
// comment on each line names the enum member the entry belongs to.
// Keyword and constant tokens use their exact source spelling.
static const char *token_names[] = {
	"Empty", // EMPTY,
	// Basic
	"Annotation", // ANNOTATION
	"Identifier", // IDENTIFIER,
	"Literal", // LITERAL,
	// Comparison
	"<", // LESS,
	"<=", // LESS_EQUAL,
	">", // GREATER,
	">=", // GREATER_EQUAL,
	"==", // EQUAL_EQUAL,
	"!=", // BANG_EQUAL,
	// Logical
	"and", // AND,
	"or", // OR,
	"not", // NOT,
	"&&", // AMPERSAND_AMPERSAND,
	"||", // PIPE_PIPE,
	"!", // BANG,
	// Bitwise
	"&", // AMPERSAND,
	"|", // PIPE,
	"~", // TILDE,
	"^", // CARET,
	"<<", // LESS_LESS,
	">>", // GREATER_GREATER,
	// Math
	"+", // PLUS,
	"-", // MINUS,
	"*", // STAR,
	"**", // STAR_STAR,
	"/", // SLASH,
	"%", // PERCENT,
	// Assignment
	"=", // EQUAL,
	"+=", // PLUS_EQUAL,
	"-=", // MINUS_EQUAL,
	"*=", // STAR_EQUAL,
	"**=", // STAR_STAR_EQUAL,
	"/=", // SLASH_EQUAL,
	"%=", // PERCENT_EQUAL,
	"<<=", // LESS_LESS_EQUAL,
	">>=", // GREATER_GREATER_EQUAL,
	"&=", // AMPERSAND_EQUAL,
	"|=", // PIPE_EQUAL,
	"^=", // CARET_EQUAL,
	// Control flow
	"if", // IF,
	"elif", // ELIF,
	"else", // ELSE,
	"for", // FOR,
	"while", // WHILE,
	"break", // BREAK,
	"continue", // CONTINUE,
	"pass", // PASS,
	"return", // RETURN,
	"match", // MATCH,
	"when", // WHEN,
	// Keywords
	"as", // AS,
	"assert", // ASSERT,
	"await", // AWAIT,
	"breakpoint", // BREAKPOINT,
	"class", // CLASS,
	"class_name", // CLASS_NAME,
	"const", // TK_CONST,
	"enum", // ENUM,
	"extends", // EXTENDS,
	"func", // FUNC,
	"in", // TK_IN,
	"is", // IS,
	"namespace", // NAMESPACE
	"preload", // PRELOAD,
	"self", // SELF,
	"signal", // SIGNAL,
	"static", // STATIC,
	"super", // SUPER,
	"trait", // TRAIT,
	"var", // VAR,
	"void", // TK_VOID,
	"yield", // YIELD,
	// Punctuation
	"[", // BRACKET_OPEN,
	"]", // BRACKET_CLOSE,
	"{", // BRACE_OPEN,
	"}", // BRACE_CLOSE,
	"(", // PARENTHESIS_OPEN,
	")", // PARENTHESIS_CLOSE,
	",", // COMMA,
	";", // SEMICOLON,
	".", // PERIOD,
	"..", // PERIOD_PERIOD,
	"...", // PERIOD_PERIOD_PERIOD,
	":", // COLON,
	"$", // DOLLAR,
	"->", // FORWARD_ARROW,
	"_", // UNDERSCORE,
	// Whitespace
	"Newline", // NEWLINE,
	"Indent", // INDENT,
	"Dedent", // DEDENT,
	// Constants
	"PI", // CONST_PI,
	"TAU", // CONST_TAU,
	"INF", // CONST_INF,
	"NAN", // CONST_NAN, (spelled as the tokenizer accepts it; GDScript is case-sensitive)
	// Error message improvement
	"VCS conflict marker", // VCS_CONFLICT_MARKER,
	"`", // BACKTICK,
	"?", // QUESTION_MARK,
	// Special
	"Error", // ERROR,
	"End of file", // EOF,
};
159

160
// Avoid desync.
161
static_assert(std_size(token_names) == GDScriptTokenizer::Token::TK_MAX, "Amount of token names don't match the amount of token types.");
162

163
const char *GDScriptTokenizer::Token::get_name() const {
164
	ERR_FAIL_INDEX_V_MSG(type, TK_MAX, "<error>", "Using token type out of the enum.");
165
	return token_names[type];
166
}
167

168
// Returns a quoted description of the token for diagnostics. Identifiers show
// their source text; every other token shows its type name.
String GDScriptTokenizer::Token::get_debug_name() const {
	if (type == IDENTIFIER) {
		return vformat(R"(identifier "%s")", source);
	}
	return vformat(R"("%s")", get_name());
}
176

177
bool GDScriptTokenizer::Token::can_precede_bin_op() const {
178
	switch (type) {
179
		case IDENTIFIER:
180
		case LITERAL:
181
		case SELF:
182
		case BRACKET_CLOSE:
183
		case BRACE_CLOSE:
184
		case PARENTHESIS_CLOSE:
185
		case CONST_PI:
186
		case CONST_TAU:
187
		case CONST_INF:
188
		case CONST_NAN:
189
			return true;
190
		default:
191
			return false;
192
	}
193
}
194

195
bool GDScriptTokenizer::Token::is_identifier() const {
196
	// Note: Most keywords should not be recognized as identifiers.
197
	// These are only exceptions for stuff that already is on the engine's API.
198
	switch (type) {
199
		case IDENTIFIER:
200
		case MATCH: // Used in String.match().
201
		case WHEN: // New keyword, avoid breaking existing code.
202
		// Allow constants to be treated as regular identifiers.
203
		case CONST_PI:
204
		case CONST_INF:
205
		case CONST_NAN:
206
		case CONST_TAU:
207
			return true;
208
		default:
209
			return false;
210
	}
211
}
212

213
bool GDScriptTokenizer::Token::is_node_name() const {
214
	// This is meant to allow keywords with the $ notation, but not as general identifiers.
215
	switch (type) {
216
		case IDENTIFIER:
217
		case AND:
218
		case AS:
219
		case ASSERT:
220
		case AWAIT:
221
		case BREAK:
222
		case BREAKPOINT:
223
		case CLASS_NAME:
224
		case CLASS:
225
		case TK_CONST:
226
		case CONST_PI:
227
		case CONST_INF:
228
		case CONST_NAN:
229
		case CONST_TAU:
230
		case CONTINUE:
231
		case ELIF:
232
		case ELSE:
233
		case ENUM:
234
		case EXTENDS:
235
		case FOR:
236
		case FUNC:
237
		case IF:
238
		case TK_IN:
239
		case IS:
240
		case MATCH:
241
		case NAMESPACE:
242
		case NOT:
243
		case OR:
244
		case PASS:
245
		case PRELOAD:
246
		case RETURN:
247
		case SELF:
248
		case SIGNAL:
249
		case STATIC:
250
		case SUPER:
251
		case TRAIT:
252
		case UNDERSCORE:
253
		case VAR:
254
		case TK_VOID:
255
		case WHILE:
256
		case WHEN:
257
		case YIELD:
258
			return true;
259
		default:
260
			return false;
261
	}
262
}
263

264
// Returns the display name for p_token_type, or "<error>" (with an engine error
// logged) when the value is outside the enum range.
String GDScriptTokenizer::get_token_name(Token::Type p_token_type) {
	// Validate before indexing so a bad type never reads outside the table.
	ERR_FAIL_INDEX_V_MSG(p_token_type, Token::TK_MAX, "<error>", "Using token type out of the enum.");
	const char *const display_name = token_names[p_token_type];
	return display_name;
}
268

269
void GDScriptTokenizerText::set_source_code(const String &p_source_code) {
270
	source = p_source_code;
271
	_source = source.get_data();
272
	_current = _source;
273
	_start = _source;
274
	line = 1;
275
	column = 1;
276
	length = p_source_code.length();
277
	position = 0;
278
}
279

280
void GDScriptTokenizerText::set_cursor_position(int p_line, int p_column) {
281
	cursor_line = p_line;
282
	cursor_column = p_column;
283
}
284

285
void GDScriptTokenizerText::set_multiline_mode(bool p_state) {
286
	multiline_mode = p_state;
287
}
288

289
void GDScriptTokenizerText::push_expression_indented_block() {
290
	indent_stack_stack.push_back(indent_stack);
291
}
292

293
void GDScriptTokenizerText::pop_expression_indented_block() {
294
	ERR_FAIL_COND(indent_stack_stack.is_empty());
295
	indent_stack = indent_stack_stack.back()->get();
296
	indent_stack_stack.pop_back();
297
}
298

299
int GDScriptTokenizerText::get_cursor_line() const {
300
	return cursor_line;
301
}
302

303
int GDScriptTokenizerText::get_cursor_column() const {
304
	return cursor_column;
305
}
306

307
bool GDScriptTokenizerText::is_past_cursor() const {
308
	if (line < cursor_line) {
309
		return false;
310
	}
311
	if (line > cursor_line) {
312
		return true;
313
	}
314
	if (column < cursor_column) {
315
		return false;
316
	}
317
	return true;
318
}
319

320
char32_t GDScriptTokenizerText::_advance() {
321
	if (unlikely(_is_at_end())) {
322
		return '\0';
323
	}
324
	_current++;
325
	column++;
326
	position++;
327
	if (unlikely(_is_at_end())) {
328
		// Add extra newline even if it's not there, to satisfy the parser.
329
		newline(true);
330
		// Also add needed unindent.
331
		check_indent();
332
	}
333
	return _peek(-1);
334
}
335

336
void GDScriptTokenizerText::push_paren(char32_t p_char) {
337
	paren_stack.push_back(p_char);
338
}
339

340
bool GDScriptTokenizerText::pop_paren(char32_t p_expected) {
341
	if (paren_stack.is_empty()) {
342
		return false;
343
	}
344
	char32_t actual = paren_stack.back()->get();
345
	paren_stack.pop_back();
346

347
	return actual == p_expected;
348
}
349

350
// Removes and returns the most recently queued error token.
// When no error is pending, logs an engine error and returns a bare ERROR token
// instead of reading the back of an empty stack.
GDScriptTokenizer::Token GDScriptTokenizerText::pop_error() {
	// Same guard style as pop_expression_indented_block(): an empty stack has no back element to read.
	ERR_FAIL_COND_V_MSG(error_stack.is_empty(), Token(Token::ERROR), "Trying to pop a tokenizer error, but none is pending.");
	Token error = error_stack.back()->get();
	error_stack.pop_back();
	return error;
}
355

356
// Builds a token of p_type covering the text between _start and the read
// pointer, records where the cursor falls relative to it (when a cursor line is
// set), and remembers it as last_token.
GDScriptTokenizer::Token GDScriptTokenizerText::make_token(Token::Type p_type) {
	Token made(p_type);
	made.source = String::utf32(Span(_start, _current - _start));
	made.start_line = start_line;
	made.start_column = start_column;
	made.end_line = line;
	made.end_column = column;

	// Error tokens never carry cursor information; a cursor line of -1 or lower disables the check.
	const bool track_cursor = p_type != Token::ERROR && cursor_line > -1;
	if (track_cursor) {
		// Also count whitespace after token.
		int trailing_blanks = 0;
		while (_peek(trailing_blanks) == ' ' || _peek(trailing_blanks) == '\t') {
			trailing_blanks++;
		}
		const int last_column = column + trailing_blanks;

		const bool cursor_at_start_column = cursor_column == start_column;
		const bool cursor_before_end_column = cursor_column < column;

		if (start_line == line) {
			// Single line token.
			const bool cursor_inside = cursor_line == start_line && start_column <= cursor_column && cursor_column <= last_column;
			if (cursor_inside) {
				if (cursor_at_start_column) {
					made.cursor_place = CURSOR_BEGINNING;
				} else if (cursor_before_end_column) {
					made.cursor_place = CURSOR_MIDDLE;
				} else {
					made.cursor_place = CURSOR_END;
				}
			}
		} else if (cursor_line == start_line && cursor_column >= start_column) {
			// Multi line token, cursor is in first line.
			if (cursor_at_start_column) {
				made.cursor_place = CURSOR_BEGINNING;
			} else {
				made.cursor_place = CURSOR_MIDDLE;
			}
		} else if (cursor_line == line && cursor_column <= last_column) {
			// Multi line token, cursor is in last line.
			if (cursor_before_end_column) {
				made.cursor_place = CURSOR_MIDDLE;
			} else {
				made.cursor_place = CURSOR_END;
			}
		} else if (start_line < cursor_line && cursor_line < line) {
			// Multi line token, cursor is in a middle line.
			made.cursor_place = CURSOR_MIDDLE;
		}
	}

	last_token = made;
	return made;
}
409

410
// Builds a LITERAL token for the current span carrying p_literal as its value.
GDScriptTokenizer::Token GDScriptTokenizerText::make_literal(const Variant &p_literal) {
	Token literal_token = make_token(Token::LITERAL);
	literal_token.literal = p_literal;
	return literal_token;
}
415

416
// Builds an IDENTIFIER token for the current span carrying p_identifier as its value.
GDScriptTokenizer::Token GDScriptTokenizerText::make_identifier(const StringName &p_identifier) {
	Token name_token = make_token(Token::IDENTIFIER);
	name_token.literal = p_identifier;
	return name_token;
}
421

422
// Builds an ERROR token for the current span with p_message stored as its
// literal. The token is only returned, not queued; see push_error().
GDScriptTokenizer::Token GDScriptTokenizerText::make_error(const String &p_message) {
	Token error_token = make_token(Token::ERROR);
	error_token.literal = p_message;
	return error_token;
}
428

429
void GDScriptTokenizerText::push_error(const String &p_message) {
430
	Token error = make_error(p_message);
431
	error_stack.push_back(error);
432
}
433

434
void GDScriptTokenizerText::push_error(const Token &p_error) {
435
	error_stack.push_back(p_error);
436
}
437

438
// Builds the error token for a closing bracket p_paren that has no opener or
// does not match the innermost opener. A mismatched opener is discarded.
GDScriptTokenizer::Token GDScriptTokenizerText::make_paren_error(char32_t p_paren) {
	if (!paren_stack.is_empty()) {
		Token mismatch = make_error(vformat("Closing \"%c\" doesn't match the opening \"%c\".", p_paren, paren_stack.back()->get()));
		// Remove opening one anyway.
		paren_stack.pop_back();
		return mismatch;
	}
	return make_error(vformat("Closing \"%c\" doesn't have an opening counterpart.", p_paren));
}
446

447
// Decides between a VCS conflict marker (a run of seven or more p_test
// characters) and the regular two-character token p_double_type.
// On entry the first character is already consumed and the second one is the
// next unread character.
GDScriptTokenizer::Token GDScriptTokenizerText::check_vcs_marker(char32_t p_test, Token::Type p_double_type) {
	int run_length = 2; // Two already matched.

	// Test before consuming characters, since we don't want to consume more than needed.
	for (const char32_t *lookahead = _current + 1; *lookahead == p_test; lookahead++) {
		run_length++;
	}

	if (run_length < 7) {
		// It is only a regular double character token, so we consume the second character.
		_advance();
		return make_token(p_double_type);
	}

	// It is a VCS conflict marker.
	// Consume all characters (first was already consumed by scan()).
	for (int remaining = run_length - 1; remaining > 0; remaining--) {
		_advance();
	}
	return make_token(Token::VCS_CONFLICT_MARKER);
}
470

471
// Scans the identifier part of an annotation and returns an ANNOTATION token
// whose literal is the token's full source text. A missing identifier queues an
// error but still produces the token.
GDScriptTokenizer::Token GDScriptTokenizerText::annotation() {
	const bool has_identifier_start = is_unicode_identifier_start(_peek());
	if (!has_identifier_start) {
		push_error("Expected annotation identifier after \"@\".");
	} else {
		_advance(); // Consume start character.
	}

	// Consume all identifier characters.
	while (is_unicode_identifier_continue(_peek())) {
		_advance();
	}

	Token result = make_token(Token::ANNOTATION);
	result.literal = StringName(result.source);
	return result;
}
485

486
// Table of every reserved word the tokenizer recognizes, written as an X-macro:
// - M_GROUP(first_char) is expanded once ahead of the keywords that start with that character.
// - M_KEYWORD(text, token_type) is expanded once per keyword.
// Each keyword must be listed under the group of its own first character.
#define KEYWORDS(M_GROUP, M_KEYWORD) \
	M_GROUP('a') \
	M_KEYWORD("as", Token::AS) \
	M_KEYWORD("and", Token::AND) \
	M_KEYWORD("assert", Token::ASSERT) \
	M_KEYWORD("await", Token::AWAIT) \
	M_GROUP('b') \
	M_KEYWORD("break", Token::BREAK) \
	M_KEYWORD("breakpoint", Token::BREAKPOINT) \
	M_GROUP('c') \
	M_KEYWORD("class", Token::CLASS) \
	M_KEYWORD("class_name", Token::CLASS_NAME) \
	M_KEYWORD("const", Token::TK_CONST) \
	M_KEYWORD("continue", Token::CONTINUE) \
	M_GROUP('e') \
	M_KEYWORD("elif", Token::ELIF) \
	M_KEYWORD("else", Token::ELSE) \
	M_KEYWORD("enum", Token::ENUM) \
	M_KEYWORD("extends", Token::EXTENDS) \
	M_GROUP('f') \
	M_KEYWORD("for", Token::FOR) \
	M_KEYWORD("func", Token::FUNC) \
	M_GROUP('i') \
	M_KEYWORD("if", Token::IF) \
	M_KEYWORD("in", Token::TK_IN) \
	M_KEYWORD("is", Token::IS) \
	M_GROUP('m') \
	M_KEYWORD("match", Token::MATCH) \
	M_GROUP('n') \
	M_KEYWORD("namespace", Token::NAMESPACE) \
	M_KEYWORD("not", Token::NOT) \
	M_GROUP('o') \
	M_KEYWORD("or", Token::OR) \
	M_GROUP('p') \
	M_KEYWORD("pass", Token::PASS) \
	M_KEYWORD("preload", Token::PRELOAD) \
	M_GROUP('r') \
	M_KEYWORD("return", Token::RETURN) \
	M_GROUP('s') \
	M_KEYWORD("self", Token::SELF) \
	M_KEYWORD("signal", Token::SIGNAL) \
	M_KEYWORD("static", Token::STATIC) \
	M_KEYWORD("super", Token::SUPER) \
	M_GROUP('t') \
	M_KEYWORD("trait", Token::TRAIT) \
	M_GROUP('v') \
	M_KEYWORD("var", Token::VAR) \
	M_KEYWORD("void", Token::TK_VOID) \
	M_GROUP('w') \
	M_KEYWORD("while", Token::WHILE) \
	M_KEYWORD("when", Token::WHEN) \
	M_GROUP('y') \
	M_KEYWORD("yield", Token::YIELD) \
	M_GROUP('I') \
	M_KEYWORD("INF", Token::CONST_INF) \
	M_GROUP('N') \
	M_KEYWORD("NAN", Token::CONST_NAN) \
	M_GROUP('P') \
	M_KEYWORD("PI", Token::CONST_PI) \
	M_GROUP('T') \
	M_KEYWORD("TAU", Token::CONST_TAU)

// Length bounds of the keywords in the table; update them whenever a shorter or
// longer keyword is added.
#define MIN_KEYWORD_LENGTH 2
#define MAX_KEYWORD_LENGTH 10
550

551
#ifdef DEBUG_ENABLED
// Fills keyword_list with the text of every keyword in the KEYWORDS table, in
// table order. Only compiled in debug builds.
void GDScriptTokenizerText::make_keyword_list() {
	// Group markers contribute nothing; each keyword contributes its text plus a separating comma.
#define SKIP_KEYWORD_GROUP(m_group)
#define EMIT_KEYWORD_TEXT(m_keyword, m_token_type) m_keyword,
	keyword_list = {
		KEYWORDS(SKIP_KEYWORD_GROUP, EMIT_KEYWORD_TEXT)
	};
#undef EMIT_KEYWORD_TEXT
#undef SKIP_KEYWORD_GROUP
}
#endif // DEBUG_ENABLED
562

563
// Scans the rest of a word whose first character is already consumed and
// classifies it as a lone underscore, a keyword, one of the literals
// true/false/null, or a plain identifier.
GDScriptTokenizer::Token GDScriptTokenizerText::potential_identifier() {
	bool ascii_only = _peek(-1) < 128;

	// Consume all identifier characters.
	while (is_unicode_identifier_continue(_peek())) {
		if (_advance() >= 128) {
			ascii_only = false;
		}
	}

	const int word_len = _current - _start;

	if (word_len == 1 && _peek(-1) == '_') {
		// Lone underscore.
		Token underscore = make_token(Token::UNDERSCORE);
		underscore.literal = "_";
		return underscore;
	}

	String word = String::utf32(Span(_start, word_len));
	if (word_len < MIN_KEYWORD_LENGTH || word_len > MAX_KEYWORD_LENGTH) {
		// Cannot be a keyword, as the length doesn't match any.
		return make_identifier(word);
	}

	if (!ascii_only) {
		// Kept here in case the order with push_error matters.
		Token non_ascii_id = make_identifier(word);

#ifdef DEBUG_ENABLED
		// Additional checks for identifiers but only in debug and if it's available in TextServer.
		if (TS->has_feature(TextServer::FEATURE_UNICODE_SECURITY)) {
			int64_t confusable = TS->is_confusable(word, keyword_list);
			if (confusable >= 0) {
				push_error(vformat(R"(Identifier "%s" is visually similar to the GDScript keyword "%s" and thus not allowed.)", word, keyword_list[confusable]));
			}
		}
#endif // DEBUG_ENABLED

		// Cannot be a keyword, as keywords are ASCII only.
		return non_ascii_id;
	}

	// Helper macros that turn the keyword table into switch cases: each group
	// closes the previous case and opens one for its first character, and each
	// keyword becomes a length-then-text comparison that returns on a match.
#define MATCH_KEYWORD_GROUP(m_first_char) \
	break; \
	case m_first_char:
#define MATCH_KEYWORD(m_keyword, m_token_type) \
	{ \
		const int keyword_length = sizeof(m_keyword) - 1; \
		static_assert(keyword_length <= MAX_KEYWORD_LENGTH, "There's a keyword longer than the defined maximum length"); \
		static_assert(keyword_length >= MIN_KEYWORD_LENGTH, "There's a keyword shorter than the defined minimum length"); \
		if (keyword_length == word_len && word == m_keyword) { \
			Token kw = make_token(m_token_type); \
			kw.literal = word; \
			return kw; \
		} \
	}

	// Find if it's a keyword.
	switch (_start[0]) {
		default:
			KEYWORDS(MATCH_KEYWORD_GROUP, MATCH_KEYWORD)
			break;
	}

#undef MATCH_KEYWORD_GROUP
#undef MATCH_KEYWORD

	// Check if it's a special literal.
	if (word == "true") {
		return make_literal(true);
	}
	if (word == "false") {
		return make_literal(false);
	}
	if (word == "null") {
		return make_literal(Variant());
	}

	// Not a keyword, so must be an identifier.
	return make_identifier(word);
}
647

648
#undef MAX_KEYWORD_LENGTH
649
#undef MIN_KEYWORD_LENGTH
650
#undef KEYWORDS
651

652
void GDScriptTokenizerText::newline(bool p_make_token) {
653
	// Don't overwrite previous newline, nor create if we want a line continuation.
654
	if (p_make_token && !pending_newline && !line_continuation) {
655
		Token newline(Token::NEWLINE);
656
		newline.start_line = line;
657
		newline.end_line = line;
658
		newline.start_column = column - 1;
659
		newline.end_column = column;
660
		pending_newline = true;
661
		last_token = newline;
662
		last_newline = newline;
663
	}
664

665
	// Increment line/column counters.
666
	line++;
667
	column = 1;
668
}
669

670
// Scans a numeric literal whose first character is already consumed: decimal
// integers and floats (with optional exponent), plus `0x` hexadecimal and `0b`
// binary integers. `_` is accepted as a digit separator. Problems are queued as
// errors and a literal token is still returned, except for a prefix with no
// digits, which returns the error token directly.
GDScriptTokenizer::Token GDScriptTokenizerText::number() {
	int radix = 10;
	bool seen_decimal_point = false;
	bool seen_exponent = false;
	bool reported_error = false;
	bool prefix_needs_digit = false;
	bool (*is_radix_digit)(char32_t) = is_digit;

	// Queues p_message as an error covering only the next unread character.
	auto report_at_cursor = [this](const String &p_message) {
		Token error = make_error(p_message);
		error.start_column = column;
		error.end_column = column + 1;
		push_error(error);
	};

	// Consumes a run of digits (as defined by p_is_digit) and `_` separators,
	// which are allowed for readability. Each underscore that directly follows
	// another one queues an error. Returns whether any actual digit was consumed.
	auto consume_digit_run = [&](bool (*p_is_digit)(char32_t)) -> bool {
		bool found_digit = false;
		bool previous_was_underscore = false;
		while (p_is_digit(_peek()) || is_underscore(_peek())) {
			if (is_underscore(_peek())) {
				if (previous_was_underscore) {
					report_at_cursor(R"(Multiple underscores cannot be adjacent in a numeric literal.)");
				}
				previous_was_underscore = true;
			} else {
				found_digit = true;
				previous_was_underscore = false;
			}
			_advance();
		}
		return found_digit;
	};

	// Sign before hexadecimal or binary.
	if ((_peek(-1) == '+' || _peek(-1) == '-') && _peek() == '0') {
		_advance();
	}

	const char32_t leading = _peek(-1);
	if (leading == '.') {
		seen_decimal_point = true;
	} else if (leading == '0') {
		const char32_t prefix = _peek();
		if (prefix == 'x' || prefix == 'X') {
			// Hexadecimal.
			radix = 16;
			is_radix_digit = is_hex_digit;
			prefix_needs_digit = true;
			_advance();
		} else if (prefix == 'b' || prefix == 'B') {
			// Binary.
			radix = 2;
			is_radix_digit = is_binary_digit;
			prefix_needs_digit = true;
			_advance();
		}
	}

	if (radix != 10 && is_underscore(_peek())) { // Disallow `0x_` and `0b_`.
		report_at_cursor(vformat(R"(Unexpected underscore after "0%c".)", _peek(-1)));
		reported_error = true;
	}
	if (consume_digit_run(is_radix_digit)) {
		prefix_needs_digit = false;
	}

	// It might be a ".." token (instead of decimal point) so we check if it's not.
	if (_peek() == '.' && _peek(1) != '.') {
		if (radix == 10) {
			if (seen_decimal_point) {
				report_at_cursor("Cannot use a decimal point twice in a number.");
				reported_error = true;
			} else {
				seen_decimal_point = true;
			}
		} else if (radix == 16) {
			report_at_cursor("Cannot use a decimal point in a hexadecimal number.");
			reported_error = true;
		} else {
			report_at_cursor("Cannot use a decimal point in a binary number.");
			reported_error = true;
		}

		if (!reported_error) {
			_advance();

			// Consume decimal digits.
			if (is_underscore(_peek())) { // Disallow `10._`, but allow `10.`.
				report_at_cursor(R"(Unexpected underscore after decimal point.)");
				reported_error = true;
			}
			consume_digit_run(is_digit);
		}
	}

	if (radix == 10 && (_peek() == 'e' || _peek() == 'E')) {
		seen_exponent = true;
		_advance();
		if (_peek() == '+' || _peek() == '-') {
			// Exponent sign.
			_advance();
		}
		// Consume exponent digits.
		if (!is_digit(_peek())) {
			report_at_cursor(R"(Expected exponent value after "e".)");
		}
		consume_digit_run(is_digit);
	}

	if (prefix_needs_digit) {
		// No digits in hex or bin literal.
		Token error = make_error(vformat(R"(Expected %s digit after "0%c".)", (radix == 16 ? "hexadecimal" : "binary"), (radix == 16 ? 'x' : 'b')));
		error.start_column = column;
		error.end_column = column + 1;
		return error;
	}

	// Detect extra decimal point.
	if (!reported_error && seen_decimal_point && _peek() == '.' && _peek(1) != '.') {
		report_at_cursor("Cannot use a decimal point twice in a number.");
		reported_error = true;
	} else if (is_unicode_identifier_start(_peek()) || is_unicode_identifier_continue(_peek())) {
		// Letter at the end of the number.
		push_error("Invalid numeric notation.");
	}

	// Create a string with the whole number.
	int len = _current - _start;
	String digits_text = String::utf32(Span(_start, len)).remove_char('_');

	// Convert to the appropriate literal type.
	switch (radix) {
		case 16: {
			int64_t value = digits_text.hex_to_int();
			return make_literal(value);
		}
		case 2: {
			int64_t value = digits_text.bin_to_int();
			return make_literal(value);
		}
		default:
			break;
	}
	if (seen_decimal_point || seen_exponent) {
		double value = digits_text.to_float();
		return make_literal(value);
	}
	int64_t value = digits_text.to_int();
	return make_literal(value);
}
848

849
// Scans a string literal and returns it as a LITERAL token holding a String,
// StringName (`&` prefix) or NodePath (`^` prefix). Handles raw strings (`r`
// prefix), triple-quoted multiline strings, escape sequences and UTF-16
// surrogate pairs written as two `\u` escapes. Recoverable problems are queued
// as errors and scanning continues; reaching the end of the input returns an
// "Unterminated string." error token.
GDScriptTokenizer::Token GDScriptTokenizerText::string() {
	enum StringType {
		STRING_REGULAR,
		STRING_NAME,
		STRING_NODEPATH,
	};

	bool is_raw = false;
	bool is_multiline = false;
	StringType type = STRING_REGULAR;

	// If the consumed character is a prefix, the opening quote follows it.
	if (_peek(-1) == 'r') {
		is_raw = true;
		_advance();
	} else if (_peek(-1) == '&') {
		type = STRING_NAME;
		_advance();
	} else if (_peek(-1) == '^') {
		type = STRING_NODEPATH;
		_advance();
	}

	char32_t quote_char = _peek(-1);

	if (_peek() == quote_char && _peek(1) == quote_char) {
		is_multiline = true;
		// Consume all quotes.
		_advance();
		_advance();
	}

	String result;
	char32_t prev = 0; // Pending lead surrogate waiting for its trail, or 0.
	int prev_pos = 0; // Column recorded for the pending lead surrogate.

	// Queues the error for a lead surrogate that never got its trail, at the
	// column recorded for it, and clears the pending state.
	auto report_unpaired_lead = [&]() {
		Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
		error.start_column = prev_pos;
		push_error(error);
		prev = 0;
	};

	for (;;) {
		// Consume actual string.
		if (_is_at_end()) {
			return make_error("Unterminated string.");
		}

		char32_t ch = _peek();

		// Direction control characters are invisible and can make the displayed text misleading.
		if (ch == 0x200E || ch == 0x200F || (ch >= 0x202A && ch <= 0x202E) || (ch >= 0x2066 && ch <= 0x2069)) {
			Token error;
			if (is_raw) {
				error = make_error("Invisible text direction control character present in the string, use regular string literal instead of r-string.");
			} else {
				error = make_error("Invisible text direction control character present in the string, escape it (\"\\u" + String::num_int64(ch, 16) + "\") to avoid confusion.");
			}
			error.start_column = column;
			error.end_column = column + 1;
			push_error(error);
		}

		if (ch == '\\') {
			// Escape pattern.
			_advance();
			if (_is_at_end()) {
				return make_error("Unterminated string.");
			}

			if (is_raw) {
				// Raw strings keep the backslash; it only stops a following quote from ending the string.
				if (_peek() == quote_char) {
					_advance();
					if (_is_at_end()) {
						return make_error("Unterminated string.");
					}
					result += '\\';
					result += quote_char;
				} else if (_peek() == '\\') { // For `\\\"`.
					_advance();
					if (_is_at_end()) {
						return make_error("Unterminated string.");
					}
					result += '\\';
					result += '\\';
				} else {
					result += '\\';
				}
			} else {
				// Grab escape character.
				char32_t code = _peek();
				_advance();
				if (_is_at_end()) {
					return make_error("Unterminated string.");
				}

				char32_t escaped = 0;
				bool valid_escape = true;

				switch (code) {
					case 'a':
						escaped = '\a';
						break;
					case 'b':
						escaped = '\b';
						break;
					case 'f':
						escaped = '\f';
						break;
					case 'n':
						escaped = '\n';
						break;
					case 'r':
						escaped = '\r';
						break;
					case 't':
						escaped = '\t';
						break;
					case 'v':
						escaped = '\v';
						break;
					case '\'':
						escaped = '\'';
						break;
					case '\"':
						escaped = '\"';
						break;
					case '\\':
						escaped = '\\';
						break;
					case 'U':
					case 'u': {
						// Hexadecimal sequence.
						int hex_len = (code == 'U') ? 6 : 4;
						for (int j = 0; j < hex_len; j++) {
							if (_is_at_end()) {
								return make_error("Unterminated string.");
							}

							char32_t digit = _peek();
							char32_t value = 0;
							if (is_digit(digit)) {
								value = digit - '0';
							} else if (digit >= 'a' && digit <= 'f') {
								value = digit - 'a';
								value += 10;
							} else if (digit >= 'A' && digit <= 'F') {
								value = digit - 'A';
								value += 10;
							} else {
								// Make error, but keep parsing the string.
								Token error = make_error("Invalid hexadecimal digit in unicode escape sequence.");
								error.start_column = column;
								error.end_column = column + 1;
								push_error(error);
								valid_escape = false;
								break;
							}

							escaped <<= 4;
							escaped |= value;

							_advance();
						}
					} break;
					case '\r':
						if (_peek() != '\n') {
							// Carriage return without newline in string. (???)
							// Just add it to the string and keep going.
							// NOTE(review): `ch` is still the backslash here, the _advance() below
							// consumes the character after the carriage return, and `escaped` (0) is
							// appended afterwards because valid_escape stays true. Confirm this is
							// intended; behavior is left unchanged.
							result += ch;
							_advance();
							break;
						}
						[[fallthrough]];
					case '\n':
						// Escaping newline.
						newline(false);
						valid_escape = false; // Don't add to the string.
						break;
					default: {
						Token error = make_error("Invalid escape in string.");
						error.start_column = column - 2;
						push_error(error);
						valid_escape = false;
					} break;
				}
				// Parse UTF-16 pair.
				if (valid_escape) {
					if ((escaped & 0xfffffc00) == 0xd800) {
						if (prev == 0) {
							// Lead surrogate: hold it until the next escape supplies the trail.
							prev = escaped;
							prev_pos = column - 2;
							continue;
						} else {
							Token error = make_error("Invalid UTF-16 sequence in string, unpaired lead surrogate.");
							error.start_column = column - 2;
							push_error(error);
							valid_escape = false;
							prev = 0;
						}
					} else if ((escaped & 0xfffffc00) == 0xdc00) {
						if (prev == 0) {
							Token error = make_error("Invalid UTF-16 sequence in string, unpaired trail surrogate.");
							error.start_column = column - 2;
							push_error(error);
							valid_escape = false;
						} else {
							// Combine the pending lead with this trail into one code point.
							escaped = (prev << 10UL) + escaped - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
							prev = 0;
						}
					}
					if (prev != 0) {
						// A non-surrogate escape followed the pending lead.
						report_unpaired_lead();
					}
				}

				if (valid_escape) {
					result += escaped;
				}
			}
		} else if (ch == quote_char) {
			if (prev != 0) {
				report_unpaired_lead();
			}
			_advance();
			if (is_multiline) {
				if (_peek() == quote_char && _peek(1) == quote_char) {
					// Ended the multiline string. Consume all quotes.
					_advance();
					_advance();
					break;
				} else {
					// Not a multiline string termination, add consumed quote.
					result += quote_char;
				}
			} else {
				// Ended single-line string.
				break;
			}
		} else {
			if (prev != 0) {
				report_unpaired_lead();
			}
			result += ch;
			_advance();
			if (ch == '\n') {
				newline(false);
			}
		}
	}
	// Both ways out of the loop go through the quote branch, which has already
	// cleared any pending lead surrogate, so no further check is needed here.

	// Make the literal.
	Variant string;
	switch (type) {
		case STRING_NAME:
			string = StringName(result);
			break;
		case STRING_NODEPATH:
			string = NodePath(result);
			break;
		case STRING_REGULAR:
			string = result;
			break;
	}

	return make_literal(string);
}
1123

1124
void GDScriptTokenizerText::check_indent() {
1125
	ERR_FAIL_COND_MSG(column != 1, "Checking tokenizer indentation in the middle of a line.");
1126

1127
	if (_is_at_end()) {
1128
		// Send dedents for every indent level.
1129
		pending_indents -= indent_level();
1130
		indent_stack.clear();
1131
		return;
1132
	}
1133

1134
	for (;;) {
1135
		char32_t current_indent_char = _peek();
1136
		int indent_count = 0;
1137

1138
		if (current_indent_char != ' ' && current_indent_char != '\t' && current_indent_char != '\r' && current_indent_char != '\n' && current_indent_char != '#') {
1139
			// First character of the line is not whitespace, so we clear all indentation levels.
1140
			// Unless we are in a continuation or in multiline mode (inside expression).
1141
			if (line_continuation || multiline_mode) {
1142
				return;
1143
			}
1144
			pending_indents -= indent_level();
1145
			indent_stack.clear();
1146
			return;
1147
		}
1148

1149
		if (_peek() == '\r') {
1150
			_advance();
1151
			if (_peek() != '\n') {
1152
				push_error("Stray carriage return character in source code.");
1153
			}
1154
		}
1155
		if (_peek() == '\n') {
1156
			// Empty line, keep going.
1157
			_advance();
1158
			newline(false);
1159
			continue;
1160
		}
1161

1162
		// Check indent level.
1163
		bool mixed = false;
1164
		while (!_is_at_end()) {
1165
			char32_t space = _peek();
1166
			if (space == '\t') {
1167
				// Consider individual tab columns.
1168
				column += tab_size - 1;
1169
				indent_count += tab_size;
1170
			} else if (space == ' ') {
1171
				indent_count += 1;
1172
			} else {
1173
				break;
1174
			}
1175
			mixed = mixed || space != current_indent_char;
1176
			_advance();
1177
		}
1178

1179
		if (_is_at_end()) {
1180
			// Reached the end with an empty line, so just dedent as much as needed.
1181
			pending_indents -= indent_level();
1182
			indent_stack.clear();
1183
			return;
1184
		}
1185

1186
		if (_peek() == '\r') {
1187
			_advance();
1188
			if (_peek() != '\n') {
1189
				push_error("Stray carriage return character in source code.");
1190
			}
1191
		}
1192
		if (_peek() == '\n') {
1193
			// Empty line, keep going.
1194
			_advance();
1195
			newline(false);
1196
			continue;
1197
		}
1198
		if (_peek() == '#') {
1199
			// Comment. Advance to the next line.
1200
#ifdef TOOLS_ENABLED
1201
			String comment;
1202
			while (_peek() != '\n' && !_is_at_end()) {
1203
				comment += _advance();
1204
			}
1205
			comments[line] = CommentData(comment, true);
1206
#else
1207
			while (_peek() != '\n' && !_is_at_end()) {
1208
				_advance();
1209
			}
1210
#endif // TOOLS_ENABLED
1211
			if (_is_at_end()) {
1212
				// Reached the end with an empty line, so just dedent as much as needed.
1213
				pending_indents -= indent_level();
1214
				indent_stack.clear();
1215
				return;
1216
			}
1217
			_advance(); // Consume '\n'.
1218
			newline(false);
1219
			continue;
1220
		}
1221

1222
		if (mixed && !line_continuation && !multiline_mode) {
1223
			Token error = make_error("Mixed use of tabs and spaces for indentation.");
1224
			error.start_line = line;
1225
			error.start_column = 1;
1226
			push_error(error);
1227
		}
1228

1229
		if (line_continuation || multiline_mode) {
1230
			// We cleared up all the whitespace at the beginning of the line.
1231
			// If this is a line continuation or we're in multiline mode then we don't want any indentation changes.
1232
			return;
1233
		}
1234

1235
		// Check if indentation character is consistent.
1236
		if (indent_char == '\0') {
1237
			// First time indenting, choose character now.
1238
			indent_char = current_indent_char;
1239
		} else if (current_indent_char != indent_char) {
1240
			Token error = make_error(vformat("Used %s character for indentation instead of %s as used before in the file.",
1241
					_get_indent_char_name(current_indent_char), _get_indent_char_name(indent_char)));
1242
			error.start_line = line;
1243
			error.start_column = 1;
1244
			push_error(error);
1245
		}
1246

1247
		// Now we can do actual indentation changes.
1248

1249
		// Check if indent or dedent.
1250
		int previous_indent = 0;
1251
		if (indent_level() > 0) {
1252
			previous_indent = indent_stack.back()->get();
1253
		}
1254
		if (indent_count == previous_indent) {
1255
			// No change in indentation.
1256
			return;
1257
		}
1258
		if (indent_count > previous_indent) {
1259
			// Indentation increased.
1260
			indent_stack.push_back(indent_count);
1261
			pending_indents++;
1262
		} else {
1263
			// Indentation decreased (dedent).
1264
			if (indent_level() == 0) {
1265
				push_error("Tokenizer bug: trying to dedent without previous indent.");
1266
				return;
1267
			}
1268
			while (indent_level() > 0 && indent_stack.back()->get() > indent_count) {
1269
				indent_stack.pop_back();
1270
				pending_indents--;
1271
			}
1272
			if ((indent_level() > 0 && indent_stack.back()->get() != indent_count) || (indent_level() == 0 && indent_count != 0)) {
1273
				// Mismatched indentation alignment.
1274
				Token error = make_error("Unindent doesn't match the previous indentation level.");
1275
				error.start_line = line;
1276
				error.start_column = 1;
1277
				error.end_column = column + 1;
1278
				push_error(error);
1279
				// Still, we'll be lenient and keep going, so keep this level in the stack.
1280
				indent_stack.push_back(indent_count);
1281
			}
1282
		}
1283
		break; // Get out of the loop in any case.
1284
	}
1285
}
1286

1287
// Returns the human-readable name ("space" or "tab") of an indentation character.
// For any other character an error is raised and the C-escaped character is returned instead.
String GDScriptTokenizerText::_get_indent_char_name(char32_t ch) {
	ERR_FAIL_COND_V(ch != ' ' && ch != '\t', String::chr(ch).c_escape());

	if (ch == ' ') {
		return "space";
	}
	return "tab";
}
1292

1293
void GDScriptTokenizerText::_skip_whitespace() {
1294
	if (pending_indents != 0) {
1295
		// Still have some indent/dedent tokens to give.
1296
		return;
1297
	}
1298

1299
	bool is_bol = column == 1; // Beginning of line.
1300

1301
	if (is_bol) {
1302
		check_indent();
1303
		return;
1304
	}
1305

1306
	for (;;) {
1307
		char32_t c = _peek();
1308
		switch (c) {
1309
			case ' ':
1310
				_advance();
1311
				break;
1312
			case '\t':
1313
				_advance();
1314
				// Consider individual tab columns.
1315
				column += tab_size - 1;
1316
				break;
1317
			case '\r':
1318
				_advance(); // Consume either way.
1319
				if (_peek() != '\n') {
1320
					push_error("Stray carriage return character in source code.");
1321
					return;
1322
				}
1323
				break;
1324
			case '\n':
1325
				_advance();
1326
				newline(!is_bol); // Don't create new line token if line is empty.
1327
				check_indent();
1328
				break;
1329
			case '#': {
1330
				// Comment.
1331
#ifdef TOOLS_ENABLED
1332
				String comment;
1333
				while (_peek() != '\n' && !_is_at_end()) {
1334
					comment += _advance();
1335
				}
1336
				comments[line] = CommentData(comment, is_bol);
1337
#else
1338
				while (_peek() != '\n' && !_is_at_end()) {
1339
					_advance();
1340
				}
1341
#endif // TOOLS_ENABLED
1342
				if (_is_at_end()) {
1343
					return;
1344
				}
1345
				_advance(); // Consume '\n'
1346
				newline(!is_bol);
1347
				check_indent();
1348
			} break;
1349
			default:
1350
				return;
1351
		}
1352
	}
1353
}
1354

1355
// Returns the next token of the source.
//
// In order of priority this yields: an already queued error, a pending newline (unless in
// multiline mode), an error queued while skipping whitespace, a pending indent or dedent,
// the end-of-file token, and finally the token lexed from the upcoming characters.
GDScriptTokenizer::Token GDScriptTokenizerText::scan() {
	if (has_error()) {
		return pop_error();
	}

	_skip_whitespace();

	if (pending_newline) {
		pending_newline = false;
		// Newline tokens are swallowed in multiline mode.
		if (!multiline_mode) {
			return last_newline;
		}
	}

	// Skipping whitespace may have queued errors of its own.
	if (has_error()) {
		return pop_error();
	}

	_start = _current;
	start_line = line;
	start_column = column;

	if (pending_indents != 0) {
		// Indent and dedent tokens start at the first column of the line.
		_start -= start_column - 1;
		start_column = 1;

		if (pending_indents < 0) {
			// Dedents.
			pending_indents++;
			Token dedent = make_token(Token::DEDENT);
			dedent.end_column += 1;
			return dedent;
		}

		// Indents.
		pending_indents--;
		return make_token(Token::INDENT);
	}

	if (_is_at_end()) {
		return make_token(Token::TK_EOF);
	}

	const char32_t c = _advance();

	if (c == '\\') {
		// Backslash line continuation: only a line break (LF or CRLF) may follow.
		if (_peek() == '\r') {
			if (_peek(1) != '\n') {
				return make_error("Unexpected carriage return character.");
			}
			_advance();
		}
		if (_peek() != '\n') {
			return make_error("Expected new line after \"\\\".");
		}
		_advance();
		newline(false);
		line_continuation = true;
		_skip_whitespace(); // Skip whitespace/comment lines after `\`. See GH-89403.
		continuation_lines.push_back(line);
		return scan(); // The continued line provides the actual token.
	}

	line_continuation = false;

	if (is_digit(c)) {
		return number();
	}
	if (c == 'r' && (_peek() == '"' || _peek() == '\'')) {
		// Raw string literal.
		return string();
	}
	if (is_unicode_identifier_start(c)) {
		return potential_identifier();
	}

	switch (c) {
		// String literals.
		case '"':
		case '\'':
			return string();

		// Annotation.
		case '@':
			return annotation();

		// Tokens made of a single character.
		case '~':
			return make_token(Token::TILDE);
		case ',':
			return make_token(Token::COMMA);
		case ':':
			return make_token(Token::COLON);
		case ';':
			return make_token(Token::SEMICOLON);
		case '$':
			return make_token(Token::DOLLAR);
		case '?':
			return make_token(Token::QUESTION_MARK);
		case '`':
			return make_token(Token::BACKTICK);

		// Opening brackets are recorded so that the closing ones can be matched.
		case '(':
			push_paren('(');
			return make_token(Token::PARENTHESIS_OPEN);
		case '[':
			push_paren('[');
			return make_token(Token::BRACKET_OPEN);
		case '{':
			push_paren('{');
			return make_token(Token::BRACE_OPEN);

		// Closing brackets must pair with the most recently opened one.
		case ')':
			if (pop_paren('(')) {
				return make_token(Token::PARENTHESIS_CLOSE);
			}
			return make_paren_error(c);
		case ']':
			if (pop_paren('[')) {
				return make_token(Token::BRACKET_CLOSE);
			}
			return make_paren_error(c);
		case '}':
			if (pop_paren('{')) {
				return make_token(Token::BRACE_CLOSE);
			}
			return make_paren_error(c);

		// Tokens that may span more than one character.
		case '!':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::BANG_EQUAL);
			}
			return make_token(Token::BANG);

		case '.':
			if (_peek() == '.') {
				_advance();
				if (_peek() == '.') {
					_advance();
					return make_token(Token::PERIOD_PERIOD_PERIOD);
				}
				return make_token(Token::PERIOD_PERIOD);
			}
			if (is_digit(_peek())) {
				// Number written with a leading '.'.
				return number();
			}
			return make_token(Token::PERIOD);

		case '+':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::PLUS_EQUAL);
			}
			if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
				// Number written with a leading '+'.
				return number();
			}
			return make_token(Token::PLUS);

		case '-':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::MINUS_EQUAL);
			}
			if (is_digit(_peek()) && !last_token.can_precede_bin_op()) {
				// Number written with a leading '-'.
				return number();
			}
			if (_peek() == '>') {
				_advance();
				return make_token(Token::FORWARD_ARROW);
			}
			return make_token(Token::MINUS);

		case '*': {
			if (_peek() == '=') {
				_advance();
				return make_token(Token::STAR_EQUAL);
			}
			if (_peek() != '*') {
				return make_token(Token::STAR);
			}
			// "**" or "**=": look past the second '*' before consuming it.
			const bool is_compound_assignment = _peek(1) == '=';
			_advance();
			if (is_compound_assignment) {
				_advance(); // The '=' after the second '*'.
				return make_token(Token::STAR_STAR_EQUAL);
			}
			return make_token(Token::STAR_STAR);
		}

		case '/':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::SLASH_EQUAL);
			}
			return make_token(Token::SLASH);

		case '%':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::PERCENT_EQUAL);
			}
			return make_token(Token::PERCENT);

		case '^':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::CARET_EQUAL);
			}
			if (_peek() == '"' || _peek() == '\'') {
				// Node path literal.
				return string();
			}
			return make_token(Token::CARET);

		case '&':
			if (_peek() == '&') {
				_advance();
				return make_token(Token::AMPERSAND_AMPERSAND);
			}
			if (_peek() == '=') {
				_advance();
				return make_token(Token::AMPERSAND_EQUAL);
			}
			if (_peek() == '"' || _peek() == '\'') {
				// String name literal.
				return string();
			}
			return make_token(Token::AMPERSAND);

		case '|':
			if (_peek() == '|') {
				_advance();
				return make_token(Token::PIPE_PIPE);
			}
			if (_peek() == '=') {
				_advance();
				return make_token(Token::PIPE_EQUAL);
			}
			return make_token(Token::PIPE);

		// Characters that may also begin a VCS conflict marker.
		case '=':
			if (_peek() == '=') {
				return check_vcs_marker('=', Token::EQUAL_EQUAL);
			}
			return make_token(Token::EQUAL);

		case '<':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::LESS_EQUAL);
			}
			if (_peek() != '<') {
				return make_token(Token::LESS);
			}
			if (_peek(1) == '=') {
				_advance();
				_advance(); // Both the second '<' and the '='.
				return make_token(Token::LESS_LESS_EQUAL);
			}
			return check_vcs_marker('<', Token::LESS_LESS);

		case '>':
			if (_peek() == '=') {
				_advance();
				return make_token(Token::GREATER_EQUAL);
			}
			if (_peek() != '>') {
				return make_token(Token::GREATER);
			}
			if (_peek(1) == '=') {
				_advance();
				_advance(); // Both the second '>' and the '='.
				return make_token(Token::GREATER_GREATER_EQUAL);
			}
			return check_vcs_marker('>', Token::GREATER_GREATER);

		default:
			// Nothing recognized this character: report it.
			if (is_whitespace(c)) {
				return make_error(vformat(R"(Invalid white space character U+%04X.)", static_cast<int32_t>(c)));
			}
			return make_error(vformat(R"(Invalid character "%c" (U+%04X).)", c, static_cast<int32_t>(c)));
	}
}
1638

1639
// Sets up the text tokenizer.
//
// In tools builds `tab_size` is read from the editor's indent size setting when editor
// settings are available. In debug builds the keyword list is prepared as well.
GDScriptTokenizerText::GDScriptTokenizerText() {
#ifdef TOOLS_ENABLED
	EditorSettings *editor_settings = EditorSettings::get_singleton();
	if (editor_settings) {
		tab_size = editor_settings->get_setting("text_editor/behavior/indent/size");
	}
#endif // TOOLS_ENABLED
#ifdef DEBUG_ENABLED
	make_keyword_list();
#endif // DEBUG_ENABLED
}
1649

1650
Product

Resources

Company