CoCalc -- scanner.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/test/simulation/fixtures/codeMapper/scanner.ts
13399 views
1
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
5
'use strict';
6

7
import { JSONScanner, ScanError, SyntaxKind } from './scannerTypes';
8

9
/**
 * Creates a JSON scanner on the given text.
 * If ignoreTrivia is set, whitespace and comments are skipped.
 */
13
export function createScanner(text: string, ignoreTrivia: boolean = false): JSONScanner {
14

15
	// Length of the input, read once; offsets at or beyond it mean end of input.
	const len = text.length;

	// Offset of the next character to be scanned.
	let pos = 0;
	// Text (or decoded content) of the most recently scanned token.
	let value = '';
	// Offset at which the most recently scanned token starts.
	let tokenOffset = 0;
	// Kind of the most recently scanned token.
	let token: SyntaxKind = SyntaxKind.Unknown;
	// Number of line breaks consumed so far.
	let lineNumber = 0;
	// Despite its name, this receives the value of lineNumber at the start of each scan.
	let lineStartOffset = 0;
	// Offset just past the most recently consumed line break.
	let tokenLineStartOffset = 0;
	// Snapshot of tokenLineStartOffset taken at the start of each scan.
	let prevTokenLineStartOffset = 0;
	// Error recorded while scanning the most recent token.
	let scanError: ScanError = ScanError.None;
25

26
	/**
	 * Reads hexadecimal digits starting at the current position and advances
	 * `pos` past every digit consumed (also when the result is -1).
	 *
	 * @param count Minimum number of digits required.
	 * @param exact When truthy, at most `count` digits are read; otherwise reading
	 *              continues until the first non-hex character.
	 * @returns The numeric value of the digits read, or -1 if fewer than `count`
	 *          digits were available.
	 */
	function scanHexDigits(count: number, exact?: boolean): number {
		let consumed = 0;
		let accumulated = 0;
		// `break` leaves the loop without running the increment, so the first
		// non-hex character is not consumed.
		for (; consumed < count || !exact; pos++, consumed++) {
			const code = text.charCodeAt(pos);
			if (CharacterCodes._0 <= code && code <= CharacterCodes._9) {
				accumulated = accumulated * 16 + code - CharacterCodes._0;
			} else if (CharacterCodes.A <= code && code <= CharacterCodes.F) {
				accumulated = accumulated * 16 + code - CharacterCodes.A + 10;
			} else if (CharacterCodes.a <= code && code <= CharacterCodes.f) {
				accumulated = accumulated * 16 + code - CharacterCodes.a + 10;
			} else {
				break;
			}
		}
		return consumed < count ? -1 : accumulated;
	}
51

52
	/**
	 * Moves the scanner to `newPosition` and clears the current-token state
	 * (value, offset, kind and error). Line tracking is left untouched.
	 *
	 * @param newPosition Offset into the text at which the next scan starts.
	 */
	function setPosition(newPosition: number) {
		// Forget whatever token was scanned last.
		token = SyntaxKind.Unknown;
		scanError = ScanError.None;
		value = '';
		tokenOffset = 0;
		// Continue scanning from the requested offset.
		pos = newPosition;
	}
59

60
	/**
	 * Scans a numeric literal whose first digit is at the current position and
	 * advances `pos` past it.
	 *
	 * Sets `scanError` to `ScanError.UnexpectedEndOfNumber` when a decimal point
	 * or an exponent marker is not followed by at least one digit.
	 *
	 * @returns The literal text. If the fraction is malformed, the text up to and
	 *          including the '.' is returned; if the exponent is malformed, the
	 *          text before the exponent marker is returned (while `pos` has
	 *          already moved past the marker and any sign).
	 */
	function scanNumber(): string {
		const start = pos;

		// Integer part: a lone leading zero, or a run of digits.
		if (text.charCodeAt(pos) === CharacterCodes._0) {
			pos++;
		} else {
			pos++;
			while (pos < text.length && isDigit(text.charCodeAt(pos))) {
				pos++;
			}
		}

		// Optional fraction: '.' followed by at least one digit.
		if (pos < text.length && text.charCodeAt(pos) === CharacterCodes.dot) {
			pos++;
			if (pos < text.length && isDigit(text.charCodeAt(pos))) {
				pos++;
				while (pos < text.length && isDigit(text.charCodeAt(pos))) {
					pos++;
				}
			} else {
				scanError = ScanError.UnexpectedEndOfNumber;
				return text.substring(start, pos);
			}
		}

		// Optional exponent: 'e' or 'E', optional sign, at least one digit.
		let end = pos;
		if (pos < text.length && (text.charCodeAt(pos) === CharacterCodes.E || text.charCodeAt(pos) === CharacterCodes.e)) {
			pos++;
			// Parenthesised so the bounds check guards both sign comparisons.
			if (pos < text.length && (text.charCodeAt(pos) === CharacterCodes.plus || text.charCodeAt(pos) === CharacterCodes.minus)) {
				pos++;
			}
			if (pos < text.length && isDigit(text.charCodeAt(pos))) {
				pos++;
				while (pos < text.length && isDigit(text.charCodeAt(pos))) {
					pos++;
				}
				end = pos;
			} else {
				scanError = ScanError.UnexpectedEndOfNumber;
			}
		}
		return text.substring(start, end);
	}
100

101
	/**
	 * Scans the body of a string literal; `pos` is expected to be just past the
	 * opening double quote. Advances `pos` past the closing quote when one is found.
	 *
	 * Errors recorded in `scanError`:
	 * - `UnexpectedEndOfString`: the input or the line ends before the closing quote.
	 * - `InvalidEscapeCharacter`: a backslash is followed by an unsupported character.
	 * - `InvalidUnicode`: `\u` is not followed by four hex digits.
	 * - `InvalidCharacter`: a control character other than a line break appears
	 *   (scanning continues).
	 *
	 * @returns The string content with escape sequences decoded.
	 */
	function scanString(): string {
		let decoded = '';
		// Start of the run of plain characters not yet copied into `decoded`.
		let chunkStart = pos;

		// Copies the pending run of plain characters into the result.
		const flushChunk = (): void => {
			decoded += text.substring(chunkStart, pos);
		};

		for (;;) {
			if (pos >= len) {
				flushChunk();
				scanError = ScanError.UnexpectedEndOfString;
				return decoded;
			}

			const code = text.charCodeAt(pos);

			if (code === CharacterCodes.doubleQuote) {
				flushChunk();
				pos++;
				return decoded;
			}

			if (code === CharacterCodes.backslash) {
				flushChunk();
				pos++;
				if (pos >= len) {
					scanError = ScanError.UnexpectedEndOfString;
					return decoded;
				}

				const escapeCode = text.charCodeAt(pos);
				pos++;

				// Decoded replacement for the escape; stays undefined on error.
				let unescaped: string | undefined;
				switch (escapeCode) {
					case CharacterCodes.doubleQuote:
						unescaped = '\"';
						break;
					case CharacterCodes.backslash:
						unescaped = '\\';
						break;
					case CharacterCodes.slash:
						unescaped = '/';
						break;
					case CharacterCodes.b:
						unescaped = '\b';
						break;
					case CharacterCodes.f:
						unescaped = '\f';
						break;
					case CharacterCodes.n:
						unescaped = '\n';
						break;
					case CharacterCodes.r:
						unescaped = '\r';
						break;
					case CharacterCodes.t:
						unescaped = '\t';
						break;
					case CharacterCodes.u: {
						const codeUnit = scanHexDigits(4, true);
						if (codeUnit >= 0) {
							unescaped = String.fromCharCode(codeUnit);
						} else {
							scanError = ScanError.InvalidUnicode;
						}
						break;
					}
					default:
						scanError = ScanError.InvalidEscapeCharacter;
				}
				if (unescaped !== undefined) {
					decoded += unescaped;
				}

				// Plain characters resume after the escape sequence.
				chunkStart = pos;
				continue;
			}

			if (code >= 0 && code <= 0x1f) {
				if (isLineBreak(code)) {
					// A raw line break ends the string; it is not consumed here.
					flushChunk();
					scanError = ScanError.UnexpectedEndOfString;
					return decoded;
				}
				// mark as error but continue with string
				scanError = ScanError.InvalidCharacter;
			}

			pos++;
		}
	}
179

180
	/**
	 * Scans the token that starts at `pos`, advances `pos` past it and records
	 * its kind, value, offset and error in the scanner state.
	 *
	 * Whitespace, line breaks and comments are returned as tokens of their own.
	 * Sets `scanError` to `ScanError.UnexpectedEndOfComment` for a block comment
	 * that is never closed; string and number errors are set by the helpers.
	 *
	 * @returns The kind of the scanned token; `SyntaxKind.EOF` once the input is
	 *          exhausted.
	 */
	function scanNext(): SyntaxKind {

		value = '';
		scanError = ScanError.None;

		tokenOffset = pos;
		// Despite its name, lineStartOffset stores the line number of this token.
		lineStartOffset = lineNumber;
		prevTokenLineStartOffset = tokenLineStartOffset;

		if (pos >= len) {
			// at the end
			tokenOffset = len;
			return token = SyntaxKind.EOF;
		}

		let code = text.charCodeAt(pos);

		// trivia: whitespace
		if (isWhiteSpace(code)) {
			do {
				pos++;
				code = text.charCodeAt(pos);
			} while (isWhiteSpace(code));
			value = text.substring(tokenOffset, pos);
			return token = SyntaxKind.Trivia;
		}

		// trivia: newlines
		if (isLineBreak(code)) {
			pos++;
			value += String.fromCharCode(code);
			// Treat "\r\n" as a single line break.
			if (code === CharacterCodes.carriageReturn && text.charCodeAt(pos) === CharacterCodes.lineFeed) {
				pos++;
				value += '\n';
			}
			lineNumber++;
			tokenLineStartOffset = pos;
			return token = SyntaxKind.LineBreakTrivia;
		}

		switch (code) {
			// tokens: []{}:,
			case CharacterCodes.openBrace:
				pos++;
				return token = SyntaxKind.OpenBraceToken;
			case CharacterCodes.closeBrace:
				pos++;
				return token = SyntaxKind.CloseBraceToken;
			case CharacterCodes.openBracket:
				pos++;
				return token = SyntaxKind.OpenBracketToken;
			case CharacterCodes.closeBracket:
				pos++;
				return token = SyntaxKind.CloseBracketToken;
			case CharacterCodes.colon:
				pos++;
				return token = SyntaxKind.ColonToken;
			case CharacterCodes.comma:
				pos++;
				return token = SyntaxKind.CommaToken;

			// strings
			case CharacterCodes.doubleQuote:
				pos++;
				value = scanString();
				return token = SyntaxKind.StringLiteral;

			// comments
			case CharacterCodes.slash: {
				const next = text.charCodeAt(pos + 1);

				// Single-line comment
				if (next === CharacterCodes.slash) {
					pos += 2;
					while (pos < len && !isLineBreak(text.charCodeAt(pos))) {
						pos++;
					}
					// The value starts at the first '/', i.e. at tokenOffset.
					value = text.substring(tokenOffset, pos);
					return token = SyntaxKind.LineCommentTrivia;
				}

				// Multi-line comment
				if (next === CharacterCodes.asterisk) {
					pos += 2;

					const safeLength = len - 1; // For lookahead.
					let commentClosed = false;
					while (pos < safeLength) {
						const ch = text.charCodeAt(pos);

						if (ch === CharacterCodes.asterisk && text.charCodeAt(pos + 1) === CharacterCodes.slash) {
							pos += 2;
							commentClosed = true;
							break;
						}

						pos++;

						if (isLineBreak(ch)) {
							if (ch === CharacterCodes.carriageReturn && text.charCodeAt(pos) === CharacterCodes.lineFeed) {
								pos++;
							}

							lineNumber++;
							tokenLineStartOffset = pos;
						}
					}

					if (!commentClosed) {
						// The unterminated comment runs to the end of the input;
						// never move past it, so the token length stays in range.
						pos = len;
						scanError = ScanError.UnexpectedEndOfComment;
					}

					value = text.substring(tokenOffset, pos);
					return token = SyntaxKind.BlockCommentTrivia;
				}

				// just a single slash
				value += String.fromCharCode(code);
				pos++;
				return token = SyntaxKind.Unknown;
			}

			// numbers
			case CharacterCodes.minus:
				value += String.fromCharCode(code);
				pos++;
				if (pos === len || !isDigit(text.charCodeAt(pos))) {
					// A lone minus is not a number.
					return token = SyntaxKind.Unknown;
				}
				// A minus followed by a digit: scan the digits as a number.
				value += scanNumber();
				return token = SyntaxKind.NumericLiteral;
			case CharacterCodes._0:
			case CharacterCodes._1:
			case CharacterCodes._2:
			case CharacterCodes._3:
			case CharacterCodes._4:
			case CharacterCodes._5:
			case CharacterCodes._6:
			case CharacterCodes._7:
			case CharacterCodes._8:
			case CharacterCodes._9:
				value += scanNumber();
				return token = SyntaxKind.NumericLiteral;

			// literals and unknown symbols
			default:
				// is a literal? Read the full word.
				while (pos < len && isUnknownContentCharacter(code)) {
					pos++;
					code = text.charCodeAt(pos);
				}
				if (tokenOffset !== pos) {
					value = text.substring(tokenOffset, pos);
					// keywords: true, false, null
					switch (value) {
						case 'true': return token = SyntaxKind.TrueKeyword;
						case 'false': return token = SyntaxKind.FalseKeyword;
						case 'null': return token = SyntaxKind.NullKeyword;
					}
					return token = SyntaxKind.Unknown;
				}
				// Nothing was consumed: emit the single character as unknown.
				value += String.fromCharCode(code);
				pos++;
				return token = SyntaxKind.Unknown;
		}
	}
350

351
	/**
	 * Tells whether `code` may be part of an unquoted word (a keyword or unknown
	 * content). Whitespace, line breaks, structural punctuation, the double
	 * quote and the slash all end such a word.
	 */
	function isUnknownContentCharacter(code: CharacterCodes) {
		const endsWord =
			isWhiteSpace(code) ||
			isLineBreak(code) ||
			code === CharacterCodes.closeBrace ||
			code === CharacterCodes.closeBracket ||
			code === CharacterCodes.openBrace ||
			code === CharacterCodes.openBracket ||
			code === CharacterCodes.doubleQuote ||
			code === CharacterCodes.colon ||
			code === CharacterCodes.comma ||
			code === CharacterCodes.slash;
		return !endsWord;
	}
368

369

370
	/**
	 * Scans tokens until one falls outside the trivia range
	 * `SyntaxKind.LineCommentTrivia` .. `SyntaxKind.Trivia`, and returns that one.
	 */
	function scanNextNonTrivia(): SyntaxKind {
		for (;;) {
			const kind = scanNext();
			const isTrivia = SyntaxKind.LineCommentTrivia <= kind && kind <= SyntaxKind.Trivia;
			if (!isTrivia) {
				return kind;
			}
		}
	}
377

378
	// With ignoreTrivia set, the public scan() skips over trivia tokens.
	const scan = ignoreTrivia ? scanNextNonTrivia : scanNext;

	return {
		setPosition,
		getPosition: () => pos,
		scan,
		getToken: () => token,
		getTokenValue: () => value,
		getTokenOffset: () => tokenOffset,
		// Distance from the token start to the current scan position.
		getTokenLength: () => pos - tokenOffset,
		// lineStartOffset is assigned the line number at the start of each scan.
		getTokenStartLine: () => lineStartOffset,
		getTokenStartCharacter: () => tokenOffset - prevTokenLineStartOffset,
		getTokenError: () => scanError,
	};
390
}
391

392
/** Returns true for the two in-line whitespace characters: space and tab. */
function isWhiteSpace(ch: number): boolean {
	switch (ch) {
		case CharacterCodes.space:
		case CharacterCodes.tab:
			return true;
		default:
			return false;
	}
}
395

396
/** Returns true for a line feed or a carriage return. */
function isLineBreak(ch: number): boolean {
	switch (ch) {
		case CharacterCodes.lineFeed:
		case CharacterCodes.carriageReturn:
			return true;
		default:
			return false;
	}
}
399

400
/** Returns true when `ch` is the character code of a decimal digit '0'..'9'. */
function isDigit(ch: number): boolean {
	return CharacterCodes._0 <= ch && ch <= CharacterCodes._9;
}
403

404
/**
 * UTF-16 code units of the characters the scanner distinguishes.
 * Declared as a `const enum`, so every use is inlined as a numeric literal.
 */
const enum CharacterCodes {
	// Control characters and whitespace
	tab = 0x09,                   // \t
	lineFeed = 0x0A,              // \n
	formFeed = 0x0C,              // \f
	carriageReturn = 0x0D,        // \r
	space = 0x20,                 // " "

	// Decimal digits
	_0 = 0x30,
	_1 = 0x31,
	_2 = 0x32,
	_3 = 0x33,
	_4 = 0x34,
	_5 = 0x35,
	_6 = 0x36,
	_7 = 0x37,
	_8 = 0x38,
	_9 = 0x39,

	// Lowercase letters
	a = 0x61,
	b = 0x62,
	c = 0x63,
	d = 0x64,
	e = 0x65,
	f = 0x66,
	g = 0x67,
	h = 0x68,
	i = 0x69,
	j = 0x6A,
	k = 0x6B,
	l = 0x6C,
	m = 0x6D,
	n = 0x6E,
	o = 0x6F,
	p = 0x70,
	q = 0x71,
	r = 0x72,
	s = 0x73,
	t = 0x74,
	u = 0x75,
	v = 0x76,
	w = 0x77,
	x = 0x78,
	y = 0x79,
	z = 0x7A,

	// Uppercase letters
	A = 0x41,
	B = 0x42,
	C = 0x43,
	D = 0x44,
	E = 0x45,
	F = 0x46,
	G = 0x47,
	H = 0x48,
	I = 0x49,
	J = 0x4A,
	K = 0x4B,
	L = 0x4C,
	M = 0x4D,
	N = 0x4E,
	O = 0x4F,
	P = 0x50,
	Q = 0x51,
	R = 0x52,
	S = 0x53,
	T = 0x54,
	U = 0x55,
	V = 0x56,
	W = 0x57,
	X = 0x58,
	Y = 0x59,
	Z = 0x5A,

	// Punctuation
	asterisk = 0x2A,              // *
	backslash = 0x5C,             // \
	closeBrace = 0x7D,            // }
	closeBracket = 0x5D,          // ]
	colon = 0x3A,                 // :
	comma = 0x2C,                 // ,
	dot = 0x2E,                   // .
	doubleQuote = 0x22,           // "
	minus = 0x2D,                 // -
	openBrace = 0x7B,             // {
	openBracket = 0x5B,           // [
	plus = 0x2B,                  // +
	slash = 0x2F,                 // /
}
492

493
Product

Resources

Company