CoCalc -- scanner.ts

GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/platform/contextkey/common/scanner.ts
³²⁹⁶ views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import { CharCode } from '../../../base/common/charCode.js';
7
import { illegalState } from '../../../base/common/errors.js';
8
import { localize } from '../../../nls.js';
9

10
/**
 * Kinds of tokens produced by the context key scanner.
 *
 * Members rely on implicit numbering, so their declaration order determines their numeric values.
 */
export const enum TokenType {
	/** `(` */
	LParen,
	/** `)` */
	RParen,
	/** `!` */
	Neg,
	/** `==` or `===` */
	Eq,
	/** `!=` or `!==` */
	NotEq,
	/** `<` */
	Lt,
	/** `<=` */
	LtEq,
	/** `>` */
	Gt,
	/** `>=` */
	GtEq,
	/** `=~` */
	RegexOp,
	/** a regular expression literal including its delimiters and flags, e.g., `/docker/i` */
	RegexStr,
	/** keyword `true` */
	True,
	/** keyword `false` */
	False,
	/** keyword `in` */
	In,
	/** keyword `not` */
	Not,
	/** `&&` */
	And,
	/** `||` */
	Or,
	/** an unquoted string, e.g., a context key name or a bare value */
	Str,
	/** a string enclosed in single quotes */
	QuotedStr,
	/** input that could not be tokenized */
	Error,
	/** end of input */
	EOF,
}
33

34
/**
 * A single token produced by the scanner, discriminated by `type`.
 *
 * Every token carries the `offset` at which it was found in the scanned input.
 * Equality tokens additionally record whether the triple form (`===` / `!==`) was used,
 * and string-like and error tokens carry their `lexeme`.
 */
export type Token =
	| { type: TokenType.LParen; offset: number }
	| { type: TokenType.RParen; offset: number }
	| { type: TokenType.Neg; offset: number }
	| { type: TokenType.Eq; offset: number; isTripleEq: boolean }
	| { type: TokenType.NotEq; offset: number; isTripleEq: boolean }
	| { type: TokenType.Lt; offset: number }
	| { type: TokenType.LtEq; offset: number }
	| { type: TokenType.Gt; offset: number }
	| { type: TokenType.GtEq; offset: number }
	| { type: TokenType.RegexOp; offset: number }
	| { type: TokenType.RegexStr; offset: number; lexeme: string }
	| { type: TokenType.True; offset: number }
	| { type: TokenType.False; offset: number }
	| { type: TokenType.In; offset: number }
	| { type: TokenType.Not; offset: number }
	| { type: TokenType.And; offset: number }
	| { type: TokenType.Or; offset: number }
	| { type: TokenType.Str; offset: number; lexeme: string }
	| { type: TokenType.QuotedStr; offset: number; lexeme: string }
	| { type: TokenType.Error; offset: number; lexeme: string }
	| { type: TokenType.EOF; offset: number };
56

57
/** Token types that are spelled as reserved words (`true`, `false`, `in`, `not`). */
type KeywordTokenType =
	| TokenType.True
	| TokenType.False
	| TokenType.In
	| TokenType.Not;
58
/**
 * Token types whose tokens consist of nothing but `type` and `offset`.
 *
 * Excluded are the types that need extra data on the token: `Eq` / `NotEq` (which carry `isTripleEq`)
 * and `RegexStr` / `Str` / `QuotedStr` / `Error` (which carry a `lexeme`).
 */
type TokenTypeWithoutLexeme =
	| TokenType.LParen
	| TokenType.RParen
	| TokenType.Neg
	| TokenType.Lt
	| TokenType.LtEq
	| TokenType.Gt
	| TokenType.GtEq
	| TokenType.RegexOp
	| TokenType.True
	| TokenType.False
	| TokenType.In
	| TokenType.Not
	| TokenType.And
	| TokenType.Or
	| TokenType.EOF;
74

75
/**
 * Describes a piece of input that the scanner could not turn into a valid token.
 *
 * Example:
 * `foo == bar'` - note how single quote doesn't have a corresponding closing quote,
 * so it's reported as unexpected
 */
export type LexingError = {
	/**
	 * Offset of the unexpected lexeme within the scanned input.
	 *
	 * Note that this doesn't take into account escape characters from the original encoding of the string,
	 * e.g., within an extension manifest file's JSON encoding.
	 */
	offset: number;
	/** The unexpected text, exactly as it appears in the scanned input. */
	lexeme: string;
	/** Optional human-readable hint about what may have been intended. */
	additionalInfo?: string;
};
85

86
/**
 * Builds a localized "Did you mean ...?" hint out of one to three suggestions.
 *
 * @param meant Suggested lexemes, in the order in which they should be listed.
 * @returns The localized hint, or `undefined` when no suggestion or more than three suggestions are passed.
 */
function hintDidYouMean(...meant: string[]): string | undefined {
	switch (meant.length) {
		case 1:
			return localize('contextkey.scanner.hint.didYouMean1', "Did you mean {0}?", meant[0]);
		case 2:
			return localize('contextkey.scanner.hint.didYouMean2', "Did you mean {0} or {1}?", meant[0], meant[1]);
		case 3:
			return localize('contextkey.scanner.hint.didYouMean3', "Did you mean {0}, {1} or {2}?", meant[0], meant[1], meant[2]);
		default: // we just don't expect that many
			return undefined;
	}
}
98

99
/** Hint for a single quote that has no matching counterpart. */
const hintDidYouForgetToOpenOrCloseQuote = localize(
	'contextkey.scanner.hint.didYouForgetToOpenOrCloseQuote',
	"Did you forget to open or close the quote?"
);

/** Hint for a regular expression that is never terminated by an unescaped `/`. */
const hintDidYouForgetToEscapeSlash = localize(
	'contextkey.scanner.hint.didYouForgetToEscapeSlash',
	"Did you forget to escape the '/' (slash) character? Put two backslashes before it to escape, e.g., '\\\\/\'."
);
101

102
/**
 * A simple scanner for context keys.
 *
 * Example:
 *
 * ```ts
 * const scanner = new Scanner().reset('resourceFileName =~ /docker/ && !config.docker.enabled');
 * const tokens = scanner.scan();
 * if (scanner.errors.length > 0) {
 *     scanner.errors.forEach(err => console.error(`Unexpected token at ${err.offset}: ${err.lexeme}\nHint: ${err.additionalInfo}`));
 * } else {
 *     // process tokens
 * }
 * ```
 */
export class Scanner {

	/**
	 * Returns the text a token stands for: a fixed spelling for punctuation, operators and keywords,
	 * the stored lexeme for string, regex and error tokens, and `'EOF'` for the end-of-input token.
	 *
	 * @param token The token to render.
	 * @returns The textual representation of `token`.
	 * @throws The error created by `illegalState` if the token type is not handled by any case.
	 */
	static getLexeme(token: Token): string {
		switch (token.type) {
			case TokenType.LParen:
				return '(';
			case TokenType.RParen:
				return ')';
			case TokenType.Neg:
				return '!';
			case TokenType.Eq:
				return token.isTripleEq ? '===' : '==';
			case TokenType.NotEq:
				return token.isTripleEq ? '!==' : '!=';
			case TokenType.Lt:
				return '<';
			case TokenType.LtEq:
				return '<=';
			case TokenType.Gt:
				return '>'; // strictly greater-than; `>=` belongs to `GtEq` below
			case TokenType.GtEq:
				return '>=';
			case TokenType.RegexOp:
				return '=~';
			case TokenType.RegexStr:
				return token.lexeme;
			case TokenType.True:
				return 'true';
			case TokenType.False:
				return 'false';
			case TokenType.In:
				return 'in';
			case TokenType.Not:
				return 'not';
			case TokenType.And:
				return '&&';
			case TokenType.Or:
				return '||';
			case TokenType.Str:
				return token.lexeme;
			case TokenType.QuotedStr:
				return token.lexeme;
			case TokenType.Error:
				return token.lexeme;
			case TokenType.EOF:
				return 'EOF';
			default:
				throw illegalState(`unhandled token type: ${JSON.stringify(token)}; have you forgotten to add a case?`);
		}
	}

	/** Char codes of the flags that may follow the closing `/` of a regex. */
	private static readonly _regexFlags = new Set(['i', 'g', 's', 'm', 'y', 'u'].map(ch => ch.charCodeAt(0)));

	/** Unquoted strings that are lexed as keyword tokens rather than as `Str` tokens. */
	private static readonly _keywords = new Map<string, KeywordTokenType>([
		['not', TokenType.Not],
		['in', TokenType.In],
		['false', TokenType.False],
		['true', TokenType.True],
	]);

	/** The text being scanned. */
	private _input: string = '';
	/** Offset at which the token currently being scanned starts. */
	private _start: number = 0;
	/** Offset of the next character to be consumed. */
	private _current: number = 0;
	/** Tokens collected since the last `reset`. */
	private _tokens: Token[] = [];
	/** Errors collected since the last `reset`. */
	private _errors: LexingError[] = [];

	/** Lexing errors collected since the last call to `reset`. */
	get errors(): Readonly<LexingError[]> {
		return this._errors;
	}

	/**
	 * Sets the input to scan and discards all positions, tokens and errors of a previous run.
	 *
	 * @param value The text to scan.
	 * @returns This scanner, so that calls can be chained.
	 */
	reset(value: string) {
		this._input = value;

		this._start = 0;
		this._current = 0;
		this._tokens = [];
		this._errors = [];

		return this;
	}

	/**
	 * Scans the remaining input into tokens and appends an `EOF` token.
	 *
	 * Tokens and errors accumulate until `reset` is called, so call `reset` before scanning a new input.
	 *
	 * @returns A copy of the token list collected so far; problems are reported as `Error` tokens
	 * and are also available through `errors`.
	 */
	scan() {
		while (!this._isAtEnd()) {

			this._start = this._current;

			const ch = this._advance();
			switch (ch) {
				case CharCode.OpenParen: this._addToken(TokenType.LParen); break;
				case CharCode.CloseParen: this._addToken(TokenType.RParen); break;

				case CharCode.ExclamationMark:
					if (this._match(CharCode.Equals)) {
						const isTripleEq = this._match(CharCode.Equals); // eat last `=` if `!==`
						this._tokens.push({ type: TokenType.NotEq, offset: this._start, isTripleEq });
					} else {
						this._addToken(TokenType.Neg);
					}
					break;

				case CharCode.SingleQuote: this._quotedString(); break;
				case CharCode.Slash: this._regex(); break;

				case CharCode.Equals:
					if (this._match(CharCode.Equals)) { // support `==`
						const isTripleEq = this._match(CharCode.Equals); // eat last `=` if `===`
						this._tokens.push({ type: TokenType.Eq, offset: this._start, isTripleEq });
					} else if (this._match(CharCode.Tilde)) {
						this._addToken(TokenType.RegexOp);
					} else {
						// a lone `=` is not an operator
						this._error(hintDidYouMean('==', '=~'));
					}
					break;

				case CharCode.LessThan: this._addToken(this._match(CharCode.Equals) ? TokenType.LtEq : TokenType.Lt); break;

				case CharCode.GreaterThan: this._addToken(this._match(CharCode.Equals) ? TokenType.GtEq : TokenType.Gt); break;

				case CharCode.Ampersand:
					if (this._match(CharCode.Ampersand)) {
						this._addToken(TokenType.And);
					} else {
						this._error(hintDidYouMean('&&'));
					}
					break;

				case CharCode.Pipe:
					if (this._match(CharCode.Pipe)) {
						this._addToken(TokenType.Or);
					} else {
						this._error(hintDidYouMean('||'));
					}
					break;

				// TODO@ulugbekna: 1) rewrite using a regex 2) reconsider what characters are considered whitespace, including unicode, nbsp, etc.
				case CharCode.Space:
				case CharCode.CarriageReturn:
				case CharCode.Tab:
				case CharCode.LineFeed:
				case CharCode.NoBreakSpace: // &nbsp
					break;

				default:
					// NOTE: `_string` adds nothing when the character is outside of the string alphabet,
					// in which case the character is skipped without producing a token or an error.
					this._string();
			}
		}

		this._start = this._current;
		this._addToken(TokenType.EOF);

		return Array.from(this._tokens);
	}

	/**
	 * Consumes the next character only if it equals `expected`.
	 *
	 * @param expected Char code to compare the next character against.
	 * @returns `true` if the character was consumed, `false` otherwise (including at the end of input).
	 */
	private _match(expected: number): boolean {
		if (this._isAtEnd()) {
			return false;
		}
		if (this._input.charCodeAt(this._current) !== expected) {
			return false;
		}
		this._current++;
		return true;
	}

	/** Consumes the next character and returns its char code. */
	private _advance(): number {
		return this._input.charCodeAt(this._current++);
	}

	/** Returns the char code of the next character without consuming it, or `CharCode.Null` at the end of input. */
	private _peek(): number {
		return this._isAtEnd() ? CharCode.Null : this._input.charCodeAt(this._current);
	}

	/** Adds a token that needs no data besides its type; its offset is the start of the current lexeme. */
	private _addToken(type: TokenTypeWithoutLexeme) {
		this._tokens.push({ type, offset: this._start });
	}

	/**
	 * Reports the text between `_start` and `_current` as unexpected:
	 * records a `LexingError` and adds an `Error` token for it.
	 *
	 * @param additional Optional hint that is stored as the error's `additionalInfo`.
	 */
	private _error(additional?: string) {
		const offset = this._start;
		const lexeme = this._input.substring(this._start, this._current);
		const errToken: Token = { type: TokenType.Error, offset: this._start, lexeme };
		this._errors.push({ offset, lexeme, additionalInfo: additional });
		this._tokens.push(errToken);
	}

	// u - unicode, y - sticky // TODO@ulugbekna: we accept double quotes as part of the string rather than as a delimiter (to preserve old parser's behavior)
	private readonly stringRe = /[a-zA-Z0-9_<>\-\./\\:\*\?\+\[\]\^,#@;"%\$\p{L}-]+/uy;

	/**
	 * Lexes an unquoted string starting at `_start`; the result is a keyword token
	 * if the lexeme is a known keyword and a `Str` token otherwise.
	 *
	 * If the character at `_start` is not part of the string alphabet, nothing is added.
	 */
	private _string() {
		// the regex is sticky, so it only matches exactly at `lastIndex`
		this.stringRe.lastIndex = this._start;
		const match = this.stringRe.exec(this._input);
		if (match) {
			this._current = this._start + match[0].length;
			const lexeme = this._input.substring(this._start, this._current);
			const keyword = Scanner._keywords.get(lexeme);
			if (keyword) {
				this._addToken(keyword);
			} else {
				this._tokens.push({ type: TokenType.Str, lexeme, offset: this._start });
			}
		}
	}

	/**
	 * Lexes a single-quoted string whose opening quote has already been consumed.
	 *
	 * Captures the lexeme without the leading and trailing `'`; the token's offset points
	 * at the first character after the opening quote. An unterminated string is reported
	 * as an error that spans the rest of the input.
	 */
	private _quotedString() {
		while (this._peek() !== CharCode.SingleQuote && !this._isAtEnd()) { // TODO@ulugbekna: add support for escaping ' ?
			this._advance();
		}

		if (this._isAtEnd()) {
			this._error(hintDidYouForgetToOpenOrCloseQuote);
			return;
		}

		// consume the closing '
		this._advance();

		this._tokens.push({ type: TokenType.QuotedStr, lexeme: this._input.substring(this._start + 1, this._current - 1), offset: this._start + 1 });
	}

	/*
	 * Lexing a regex expression: /.../[igsmyu]*
	 * Based on https://github.com/microsoft/TypeScript/blob/9247ef115e617805983740ba795d7a8164babf89/src/compiler/scanner.ts#L2129-L2181
	 *
	 * Note that we want slashes within a regex to be escaped, e.g., /file:\\/\\/\\// should match `file:///`
	 *
	 * The opening slash has already been consumed. The resulting lexeme includes both slashes and the flags.
	 * If no terminating slash is found, the rest of the input is reported as an error.
	 */
	private _regex() {
		let p = this._current;

		let inEscape = false;
		let inCharacterClass = false;
		while (true) {
			if (p >= this._input.length) {
				// ran out of input before finding the closing slash
				this._current = p;
				this._error(hintDidYouForgetToEscapeSlash);
				return;
			}

			const ch = this._input.charCodeAt(p);

			if (inEscape) { // parsing an escape character
				inEscape = false;
			} else if (ch === CharCode.Slash && !inCharacterClass) { // end of regex
				p++;
				break;
			} else if (ch === CharCode.OpenSquareBracket) {
				inCharacterClass = true;
			} else if (ch === CharCode.Backslash) {
				inEscape = true;
			} else if (ch === CharCode.CloseSquareBracket) {
				inCharacterClass = false;
			}
			p++;
		}

		// Consume flags // TODO@ulugbekna: use regex instead
		while (p < this._input.length && Scanner._regexFlags.has(this._input.charCodeAt(p))) {
			p++;
		}

		this._current = p;

		const lexeme = this._input.substring(this._start, this._current);
		this._tokens.push({ type: TokenType.RegexStr, lexeme, offset: this._start });
	}

	/** Whether all of the input has been consumed. */
	private _isAtEnd() {
		return this._current >= this._input.length;
	}
}
385

386
Product

Resources

Company