CoCalc -- tokenizer.ts

GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/editor/common/model/bracketPairsTextModelPart/bracketPairsTree/tokenizer.ts
3296 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import { NotSupportedError } from '../../../../../base/common/errors.js';
7
import { StandardTokenType, TokenMetadata } from '../../../encodedTokenAttributes.js';
8
import { IViewLineTokens } from '../../../tokens/lineTokens.js';
9
import { BracketAstNode, TextAstNode } from './ast.js';
10
import { BracketTokens, LanguageAgnosticBracketTokens } from './brackets.js';
11
import { Length, lengthAdd, lengthDiff, lengthGetColumnCountIfZeroLineCount, lengthToObj, lengthZero, toLength } from './length.js';
12
import { SmallImmutableSet } from './smallImmutableSet.js';
13

14
/**
 * A forward-only stream of {@link Token}s over a text.
 *
 * Implementations in this file return `null` from `read`/`peek` once the
 * end of the text has been reached.
 */
export interface Tokenizer {
	/** Sum of the lengths of all tokens consumed (or skipped) so far. */
	readonly offset: Length;

	/** Length of the entire underlying text. */
	readonly length: Length;

	/** Returns the next token without consuming it, or `null` at the end. */
	peek(): Token | null;

	/** Consumes and returns the next token, or `null` at the end. */
	read(): Token | null;

	/** Advances the tokenizer by `length` without producing tokens. */
	skip(length: Length): void;

	/** Returns the full text that is being tokenized. */
	getText(): string;
}
24

25
/**
 * Classifies a {@link Token}: plain text, or one of the two bracket kinds.
 * The numeric values are part of the contract and must not change.
 */
export const enum TokenKind {
	/** Anything that is not a bracket. */
	Text = 0,
	/** A bracket that opens a pair. */
	OpeningBracket = 1,
	/** A bracket that closes a pair. */
	ClosingBracket = 2,
}
30

31
/**
 * Numeric id of an opening bracket, as stored in `Token.bracketId` and
 * `Token.bracketIds`. Tokens that are not brackets use `-1` for `bracketId`.
 */
export type OpeningBracketId = number;
32

33
/**
 * An immutable token produced by a {@link Tokenizer}: either a run of text
 * or a single opening/closing bracket.
 */
export class Token {
	/** Length of the text covered by this token. */
	readonly length: Length;

	/** Whether this token is text, an opening bracket or a closing bracket. */
	readonly kind: TokenKind;

	/**
	 * If this token is an opening bracket, this is the id of the opening bracket.
	 * If this token is a closing bracket, this is the id of the first opening bracket that is closed by this bracket.
	 * Otherwise, it is -1.
	 */
	readonly bracketId: OpeningBracketId;

	/**
	 * If this token is an opening bracket, this just contains `bracketId`.
	 * If this token is a closing bracket, this lists all opening bracket ids, that it closes.
	 * Otherwise, it is empty.
	 */
	readonly bracketIds: SmallImmutableSet<OpeningBracketId>;

	/** AST node associated with this token, if one was supplied at construction. */
	readonly astNode: BracketAstNode | TextAstNode | undefined;

	constructor(
		length: Length,
		kind: TokenKind,
		bracketId: OpeningBracketId,
		bracketIds: SmallImmutableSet<OpeningBracketId>,
		astNode: BracketAstNode | TextAstNode | undefined,
	) {
		this.length = length;
		this.kind = kind;
		this.bracketId = bracketId;
		this.bracketIds = bracketIds;
		this.astNode = astNode;
	}
}
52

53
/**
 * The minimal view of a text model that the tokenizers in this file need.
 * Line numbers passed to this interface are 1-based: the tokenizers call
 * `getLineLength(getLineCount())` for the last line and
 * `getLineTokens(lineIdx + 1)` for a 0-based `lineIdx`.
 */
export interface ITokenizerSource {
	/** Returns the complete text. */
	getValue(): string;

	/** Returns the number of lines. */
	getLineCount(): number;

	/** Returns the length of the given (1-based) line. */
	getLineLength(lineNumber: number): number;

	/** Access to the per-line tokens of the text. */
	tokenization: {
		/** Returns the tokens of the given (1-based) line. */
		getLineTokens(lineNumber: number): IViewLineTokens;
	};
}
62

63
/**
 * Tokenizer over an {@link ITokenizerSource}. It wraps a
 * `NonPeekableTextBufferTokenizer` and adds a single token of look-ahead
 * (`peek`) as well as offset tracking.
 */
export class TextBufferTokenizer implements Tokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	/** The underlying reader that actually produces the tokens. */
	private readonly reader: NonPeekableTextBufferTokenizer;

	/** Sum of the lengths of all tokens returned by `read` plus everything skipped. */
	private _offset: Length;

	/** `true` iff `peeked` holds the (not yet consumed) result of the last `reader.read()`. */
	private didPeek: boolean;
	/** The look-ahead token; only meaningful while `didPeek` is `true`. */
	private peeked: Token | null;

	constructor(
		private readonly textModel: ITokenizerSource,
		private readonly bracketTokens: LanguageAgnosticBracketTokens
	) {
		this.reader = new NonPeekableTextBufferTokenizer(this.textModel, this.bracketTokens);
		this._offset = lengthZero;
		this.didPeek = false;
		this.peeked = null;
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	get offset() {
		return this._offset;
	}

	/** Length of the whole text: (line count - 1) line breaks plus the length of the last line. */
	get length() {
		return toLength(this.textBufferLineCount - 1, this.textBufferLastLineLength);
	}

	getText() {
		return this.textModel.getValue();
	}

	/**
	 * Advances the offset by `length` and repositions the underlying reader
	 * there. Any pending look-ahead token is discarded.
	 */
	skip(length: Length): void {
		// Drop the look-ahead completely so that a following `read()` cannot
		// hand out a token that belongs to the position before the skip.
		this.didPeek = false;
		this.peeked = null;
		this._offset = lengthAdd(this._offset, length);
		const obj = lengthToObj(this._offset);
		this.reader.setPosition(obj.lineCount, obj.columnCount);
	}

	/**
	 * Consumes and returns the next token (or `null` at the end of the text)
	 * and advances `offset` by the token's length.
	 */
	read(): Token | null {
		let token: Token | null;
		if (this.didPeek) {
			// Hand out the look-ahead exactly once. Testing `didPeek` (rather
			// than `peeked`) and clearing `peeked` prevents returning the same
			// token again on a subsequent `read()` without a `peek()` in between.
			token = this.peeked;
			this.didPeek = false;
			this.peeked = null;
		} else {
			token = this.reader.read();
		}
		if (token) {
			this._offset = lengthAdd(this._offset, token.length);
		}
		return token;
	}

	/**
	 * Returns the next token without consuming it (or `null` at the end of
	 * the text). Repeated calls return the same token until `read` or `skip`
	 * is called.
	 */
	peek(): Token | null {
		if (!this.didPeek) {
			this.peeked = this.reader.read();
			this.didPeek = true;
		}
		return this.peeked;
	}
}
127

128
/**
 * Produces the tokens of an {@link ITokenizerSource} one at a time.
 * Does not support peek.
 */
class NonPeekableTextBufferTokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	/** 0-based index of the line the reader is currently on. */
	private lineIdx = 0;
	/** Content of the current line, or `null` if it has to be (re)loaded. */
	private line: string | null = null;
	/** Character offset within the current line. */
	private lineCharOffset = 0;
	/** Tokens of the most recently loaded line. */
	private lineTokens: IViewLineTokens | null = null;
	/** Index into `lineTokens` of the token at `lineCharOffset`. */
	private lineTokenOffset = 0;

	/** Must be a zero line token. The end of the document cannot be peeked. */
	private peekedToken: Token | null = null;

	constructor(private readonly textModel: ITokenizerSource, private readonly bracketTokens: LanguageAgnosticBracketTokens) {
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	/**
	 * Moves the reader to the given 0-based line and column and drops any
	 * internally buffered bracket token.
	 */
	public setPosition(lineIdx: number, column: number): void {
		// We must not jump into a token!
		const staysOnLine = lineIdx === this.lineIdx;
		this.lineCharOffset = column;
		if (!staysOnLine) {
			// Different line: its content is loaded lazily by the next `read`.
			this.lineIdx = lineIdx;
			this.line = null;
		} else if (this.line !== null) {
			this.lineTokenOffset = this.tokenIndexAt(this.lineTokens!, column);
		}
		this.peekedToken = null;
	}

	/** Index of the token at `charOffset`; offset 0 always maps to token 0. */
	private tokenIndexAt(tokens: IViewLineTokens, charOffset: number): number {
		return charOffset === 0 ? 0 : tokens.findTokenIndexAtOffset(charOffset);
	}

	/**
	 * Returns the next token: either a single bracket token or a text token
	 * covering everything up to the next bracket (bounded by a length
	 * heuristic). Returns `null` at the end of the text.
	 */
	public read(): Token | null {
		const pending = this.peekedToken;
		if (pending) {
			// A bracket was found while producing the preceding text token; emit it now.
			this.peekedToken = null;
			this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(pending.length);
			return pending;
		}

		const lastLineIdx = this.textBufferLineCount - 1;
		if (this.lineIdx > lastLineIdx || (this.lineIdx === lastLineIdx && this.lineCharOffset >= this.textBufferLastLineLength)) {
			// We are after the end
			return null;
		}

		if (this.line === null) {
			this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
			this.line = this.lineTokens.getLineContent();
			this.lineTokenOffset = this.tokenIndexAt(this.lineTokens, this.lineCharOffset);
		}

		const firstLineIdx = this.lineIdx;
		const firstLineCharOffset = this.lineCharOffset;

		// Limits the length of text tokens.
		// If text tokens get too long, incremental updates will be slow.
		let lengthHeuristic = 0;
		for (; ;) {
			const tokens = this.lineTokens!;
			const tokenCount = tokens.getCount();

			if (this.lineTokenOffset >= tokenCount) {
				// The current line is exhausted.
				if (this.lineIdx === this.textBufferLineCount - 1) {
					break;
				}
				this.lineIdx++;
				this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
				this.lineTokenOffset = 0;
				this.line = this.lineTokens.getLineContent();
				this.lineCharOffset = 0;

				lengthHeuristic += 33; // max 1000/33 = 30 lines
				// This limits the amount of work to recompute min-indentation

				if (lengthHeuristic > 1000) {
					// only break (automatically) at the end of line.
					break;
				}
			} else {
				const metadata = tokens.getMetadata(this.lineTokenOffset);
				// Skip tokens that are identical.
				// Sometimes, (bracket) identifiers are split up into multiple tokens.
				while (this.lineTokenOffset + 1 < tokenCount && metadata === tokens.getMetadata(this.lineTokenOffset + 1)) {
					this.lineTokenOffset++;
				}

				const isOther = TokenMetadata.getTokenType(metadata) === StandardTokenType.Other;
				const containsBracketType = TokenMetadata.containsBalancedBrackets(metadata);
				const endOffset = tokens.getEndOffset(this.lineTokenOffset);

				// Is there a bracket token next? Only consume text.
				let bracket: Token | null = null;
				if (containsBracketType && isOther && this.lineCharOffset < endOffset) {
					const languageId = tokens.getLanguageId(this.lineTokenOffset);
					const candidateText = this.line.substring(this.lineCharOffset, endOffset);

					const languageBrackets = this.bracketTokens.getSingleLanguageBracketTokens(languageId);
					const bracketRegExp = languageBrackets.regExpGlobal;
					if (bracketRegExp) {
						bracketRegExp.lastIndex = 0;
						const found = bracketRegExp.exec(candidateText);
						if (found) {
							bracket = languageBrackets.getToken(found[0]) || null;
							if (bracket) {
								// Consume leading text of the token
								this.lineCharOffset += found.index;
							}
						}
					}
				}

				lengthHeuristic += endOffset - this.lineCharOffset;

				if (bracket === null) {
					// Skip the entire token, as the token contains no brackets at all.
					this.lineTokenOffset++;
					this.lineCharOffset = endOffset;
				} else if (firstLineIdx !== this.lineIdx || firstLineCharOffset !== this.lineCharOffset) {
					// Don't skip the entire token, as a single token could contain multiple brackets.
					// There is text before the bracket: emit the text now, the bracket on the next read.
					this.peekedToken = bracket;
					break;
				} else {
					// Nothing precedes the bracket: consume and return it directly.
					this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(bracket.length);
					return bracket;
				}
			}

			if (lengthHeuristic > 1500) {
				// Eventually break regardless of the line length so that
				// very long lines do not cause bad performance.
				// This effectively limits max indentation to 500, as
				// indentation is not computed across multiple text nodes.
				break;
			}
		}

		// If a token contains some proper indentation, it also contains \n{INDENTATION+}(?!{INDENTATION}),
		// unless the line is too long.
		// Thus, the min indentation of the document is the minimum min indentation of every text node.
		const textLength = lengthDiff(firstLineIdx, firstLineCharOffset, this.lineIdx, this.lineCharOffset);
		return new Token(textLength, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(textLength));
	}
}
281

282
/**
 * Tokenizer that splits a plain string into bracket and text tokens using a
 * single regular expression, without consulting any language tokens.
 * All tokens are computed eagerly in the constructor. `skip` is not supported.
 */
export class FastTokenizer implements Tokenizer {
	/** Sum of the lengths of all tokens returned by `read` so far. */
	private _offset: Length = lengthZero;
	private readonly tokens: readonly Token[];
	/** Index into `tokens` of the next token to hand out. */
	private idx = 0;

	/** Length of the whole text. */
	readonly length: Length;

	/**
	 * @param text The text to tokenize.
	 * @param brackets The brackets to recognize; everything else becomes text.
	 */
	constructor(private readonly text: string, brackets: BracketTokens) {
		const regExpStr = brackets.getRegExpStr();
		// Always scan for line breaks, even if there are no brackets at all.
		// Otherwise multi-line text would be reported as a single long line
		// (zero line count), for both the text token and `length`.
		const regexp = new RegExp(regExpStr ? regExpStr + '|\n' : '\n', 'gi');

		const tokens: Token[] = [];

		let curLineCount = 0;
		let lastLineBreakOffset = 0;

		let lastTokenEndOffset = 0;
		let lastTokenEndLine = 0;

		// Short text tokens are very common; share one instance per length.
		const smallTextTokens0Line: Token[] = [];
		const smallTextTokens1Line: Token[] = [];
		for (let i = 0; i < 60; i++) {
			smallTextTokens0Line.push(
				new Token(
					toLength(0, i), TokenKind.Text, -1, SmallImmutableSet.getEmpty(),
					new TextAstNode(toLength(0, i))
				)
			);
			smallTextTokens1Line.push(
				new Token(
					toLength(1, i), TokenKind.Text, -1, SmallImmutableSet.getEmpty(),
					new TextAstNode(toLength(1, i))
				)
			);
		}

		// Returns a text token of the given size, shared if it is small enough.
		const textToken = (lineCount: number, colCount: number): Token => {
			if (lineCount === 0 && colCount < smallTextTokens0Line.length) {
				return smallTextTokens0Line[colCount];
			}
			if (lineCount === 1 && colCount < smallTextTokens1Line.length) {
				return smallTextTokens1Line[colCount];
			}
			const length = toLength(lineCount, colCount);
			return new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length));
		};

		// Text between the end of the last bracket and `endOffset`:
		// same line -> plain column distance, otherwise the number of line
		// breaks in between plus the columns after the last line break.
		const textTokenUpTo = (endOffset: number): Token =>
			lastTokenEndLine === curLineCount
				? textToken(0, endOffset - lastTokenEndOffset)
				: textToken(curLineCount - lastTokenEndLine, endOffset - lastLineBreakOffset);

		// If a token contains indentation, it also contains \n{INDENTATION+}(?!{INDENTATION})
		let match: RegExpExecArray | null;
		while ((match = regexp.exec(text)) !== null) {
			const curOffset = match.index;
			const value = match[0];
			if (value === '\n') {
				curLineCount++;
				lastLineBreakOffset = curOffset + 1;
				continue;
			}

			if (lastTokenEndOffset !== curOffset) {
				// There is text between the previous bracket and this one.
				tokens.push(textTokenUpTo(curOffset));
			}

			// value is matched by regexp, so the token must exist
			tokens.push(brackets.getToken(value)!);

			lastTokenEndOffset = curOffset + value.length;
			lastTokenEndLine = curLineCount;
		}

		const offset = text.length;

		if (lastTokenEndOffset !== offset) {
			// Trailing text after the last bracket.
			tokens.push(textTokenUpTo(offset));
		}

		this.length = toLength(curLineCount, offset - lastLineBreakOffset);
		this.tokens = tokens;
	}

	get offset(): Length {
		return this._offset;
	}

	/**
	 * Consumes and returns the next token (or `null` at the end of the text)
	 * and advances `offset` by the token's length.
	 */
	read(): Token | null {
		const token = this.tokens[this.idx] || null;
		if (token !== null) {
			this.idx++;
			this._offset = lengthAdd(this._offset, token.length);
		}
		return token;
	}

	/** Returns the next token without consuming it, or `null` at the end of the text. */
	peek(): Token | null {
		return this.tokens[this.idx] || null;
	}

	/**
	 * Not supported by this tokenizer.
	 * @throws NotSupportedError always.
	 */
	skip(length: Length): void {
		throw new NotSupportedError();
	}

	getText(): string {
		return this.text;
	}
}
397

398
Product

Resources

Company