GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/editor/common/model/bracketPairsTextModelPart/bracketPairsTree/tokenizer.ts
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { NotSupportedError } from '../../../../../base/common/errors.js';
import { StandardTokenType, TokenMetadata } from '../../../encodedTokenAttributes.js';
import { IViewLineTokens } from '../../../tokens/lineTokens.js';
import { BracketAstNode, TextAstNode } from './ast.js';
import { BracketTokens, LanguageAgnosticBracketTokens } from './brackets.js';
import { Length, lengthAdd, lengthDiff, lengthGetColumnCountIfZeroLineCount, lengthToObj, lengthZero, toLength } from './length.js';
import { SmallImmutableSet } from './smallImmutableSet.js';

export interface Tokenizer {
	readonly offset: Length;
	readonly length: Length;

	read(): Token | null;
	peek(): Token | null;
	skip(length: Length): void;

	getText(): string;
}

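// Editorial note, not part of the original file: an illustrative sketch of how a
// consumer might drive the `Tokenizer` interface above. The `countBrackets` helper
// is hypothetical; it only uses members declared in this file.
//
//   function countBrackets(tokenizer: Tokenizer): number {
//       let brackets = 0;
//       // `read()` consumes the next token and returns null once the input is exhausted.
//       for (let token = tokenizer.read(); token !== null; token = tokenizer.read()) {
//           if (token.kind !== TokenKind.Text) {
//               brackets++;
//           }
//       }
//       return brackets;
//   }
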
export const enum TokenKind {
	Text = 0,
	OpeningBracket = 1,
	ClosingBracket = 2,
}

export type OpeningBracketId = number;

export class Token {
	constructor(
		readonly length: Length,
		readonly kind: TokenKind,
		/**
		 * If this token is an opening bracket, this is the id of the opening bracket.
		 * If this token is a closing bracket, this is the id of the first opening bracket that is closed by this bracket.
		 * Otherwise, it is -1.
		 */
		readonly bracketId: OpeningBracketId,
		/**
		 * If this token is an opening bracket, this just contains `bracketId`.
		 * If this token is a closing bracket, this lists all opening bracket ids that it closes.
		 * Otherwise, it is empty.
		 */
		readonly bracketIds: SmallImmutableSet<OpeningBracketId>,
		readonly astNode: BracketAstNode | TextAstNode | undefined,
	) { }
}

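// Editorial note, not part of the original file: a sketch of the `Token` fields with
// made-up bracket ids. In a language where `end` can close both `begin` (id 3) and
// `case` (id 7), the closing `end` token would carry bracketId 3 and bracketIds {3, 7},
// an opening `begin` token would carry bracketId 3 and bracketIds {3}, and a plain
// text token carries -1 and the empty set:
//
//   const textLength = toLength(0, 5); // 5 columns, 0 line breaks
//   const textToken = new Token(
//       textLength, TokenKind.Text, -1,
//       SmallImmutableSet.getEmpty(), new TextAstNode(textLength)
//   );
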
export interface ITokenizerSource {
	getValue(): string;
	getLineCount(): number;
	getLineLength(lineNumber: number): number;

	tokenization: {
		getLineTokens(lineNumber: number): IViewLineTokens;
	};
}

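// Editorial note, not part of the original file: a minimal sketch of adapting a
// model-like object to `ITokenizerSource`. The `model` shape used here is an
// assumption for the example, not a specific vscode API.
//
//   function toTokenizerSource(model: {
//       getValue(): string;
//       getLineCount(): number;
//       getLineLength(lineNumber: number): number;
//       getLineTokens(lineNumber: number): IViewLineTokens;
//   }): ITokenizerSource {
//       return {
//           getValue: () => model.getValue(),
//           getLineCount: () => model.getLineCount(),
//           getLineLength: (lineNumber) => model.getLineLength(lineNumber),
//           tokenization: {
//               getLineTokens: (lineNumber) => model.getLineTokens(lineNumber),
//           },
//       };
//   }
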
export class TextBufferTokenizer implements Tokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	private readonly reader: NonPeekableTextBufferTokenizer;

	constructor(
		private readonly textModel: ITokenizerSource,
		private readonly bracketTokens: LanguageAgnosticBracketTokens
	) {
		this.reader = new NonPeekableTextBufferTokenizer(this.textModel, this.bracketTokens);
		this._offset = lengthZero;
		this.didPeek = false;
		this.peeked = null;
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	private _offset: Length;

	get offset() {
		return this._offset;
	}

	get length() {
		return toLength(this.textBufferLineCount - 1, this.textBufferLastLineLength);
	}

	getText() {
		return this.textModel.getValue();
	}

	skip(length: Length): void {
		this.didPeek = false;
		this._offset = lengthAdd(this._offset, length);
		const obj = lengthToObj(this._offset);
		this.reader.setPosition(obj.lineCount, obj.columnCount);
	}

	private didPeek: boolean;
	private peeked: Token | null;

	read(): Token | null {
		let token: Token | null;
		if (this.peeked) {
			this.didPeek = false;
			token = this.peeked;
			this.peeked = null;
		} else {
			token = this.reader.read();
		}
		if (token) {
			this._offset = lengthAdd(this._offset, token.length);
		}
		return token;
	}

	peek(): Token | null {
		if (!this.didPeek) {
			this.peeked = this.reader.read();
			this.didPeek = true;
		}
		return this.peeked;
	}
}

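// Editorial note, not part of the original file: an illustrative sketch of driving
// `TextBufferTokenizer`. It assumes `source: ITokenizerSource` and
// `bracketTokens: LanguageAgnosticBracketTokens` already exist (they are constructed
// outside this file).
//
//   const tokenizer = new TextBufferTokenizer(source, bracketTokens);
//   for (let token = tokenizer.read(); token !== null; token = tokenizer.read()) {
//       // `tokenizer.offset` has already been advanced past this token.
//       console.log(token.kind, token.length);
//   }
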
/**
 * Does not support peek.
 */
class NonPeekableTextBufferTokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	constructor(private readonly textModel: ITokenizerSource, private readonly bracketTokens: LanguageAgnosticBracketTokens) {
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	private lineIdx = 0;
	private line: string | null = null;
	private lineCharOffset = 0;
	private lineTokens: IViewLineTokens | null = null;
	private lineTokenOffset = 0;

	public setPosition(lineIdx: number, column: number): void {
		// We must not jump into a token!
		if (lineIdx === this.lineIdx) {
			this.lineCharOffset = column;
			if (this.line !== null) {
				this.lineTokenOffset = this.lineCharOffset === 0 ? 0 : this.lineTokens!.findTokenIndexAtOffset(this.lineCharOffset);
			}
		} else {
			this.lineIdx = lineIdx;
			this.lineCharOffset = column;
			this.line = null;
		}
		this.peekedToken = null;
	}

	/** Must be a token with a line count of zero. The end of the document cannot be peeked. */
	private peekedToken: Token | null = null;

	public read(): Token | null {
		if (this.peekedToken) {
			const token = this.peekedToken;
			this.peekedToken = null;
			this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(token.length);
			return token;
		}

		if (this.lineIdx > this.textBufferLineCount - 1 || (this.lineIdx === this.textBufferLineCount - 1 && this.lineCharOffset >= this.textBufferLastLineLength)) {
			// We are past the end of the document.
			return null;
		}

		if (this.line === null) {
			this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
			this.line = this.lineTokens.getLineContent();
			this.lineTokenOffset = this.lineCharOffset === 0 ? 0 : this.lineTokens.findTokenIndexAtOffset(this.lineCharOffset);
		}

		const startLineIdx = this.lineIdx;
		const startLineCharOffset = this.lineCharOffset;

		// Limits the length of text tokens.
		// If text tokens get too long, incremental updates will be slow.
		let lengthHeuristic = 0;
		while (true) {
			const lineTokens = this.lineTokens!;
			const tokenCount = lineTokens.getCount();

			let peekedBracketToken: Token | null = null;

			if (this.lineTokenOffset < tokenCount) {
				const tokenMetadata = lineTokens.getMetadata(this.lineTokenOffset);
				while (this.lineTokenOffset + 1 < tokenCount && tokenMetadata === lineTokens.getMetadata(this.lineTokenOffset + 1)) {
					// Skip tokens that are identical.
					// Sometimes, (bracket) identifiers are split up into multiple tokens.
					this.lineTokenOffset++;
				}

				const isOther = TokenMetadata.getTokenType(tokenMetadata) === StandardTokenType.Other;
				const containsBracketType = TokenMetadata.containsBalancedBrackets(tokenMetadata);

				const endOffset = lineTokens.getEndOffset(this.lineTokenOffset);
				// Is there a bracket token next? Only consume the text before it.
				if (containsBracketType && isOther && this.lineCharOffset < endOffset) {
					const languageId = lineTokens.getLanguageId(this.lineTokenOffset);
					const text = this.line.substring(this.lineCharOffset, endOffset);

					const brackets = this.bracketTokens.getSingleLanguageBracketTokens(languageId);
					const regexp = brackets.regExpGlobal;
					if (regexp) {
						regexp.lastIndex = 0;
						const match = regexp.exec(text);
						if (match) {
							peekedBracketToken = brackets.getToken(match[0])!;
							if (peekedBracketToken) {
								// Consume the leading text of the token.
								this.lineCharOffset += match.index;
							}
						}
					}
				}

				lengthHeuristic += endOffset - this.lineCharOffset;

				if (peekedBracketToken) {
					// Don't skip the entire token, as a single token could contain multiple brackets.

					if (startLineIdx !== this.lineIdx || startLineCharOffset !== this.lineCharOffset) {
						// There is text before the bracket.
						this.peekedToken = peekedBracketToken;
						break;
					} else {
						// Consume the peeked token.
						this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(peekedBracketToken.length);
						return peekedBracketToken;
					}
				} else {
					// Skip the entire token, as the token contains no brackets at all.
					this.lineTokenOffset++;
					this.lineCharOffset = endOffset;
				}
			} else {
				if (this.lineIdx === this.textBufferLineCount - 1) {
					break;
				}
				this.lineIdx++;
				this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
				this.lineTokenOffset = 0;
				this.line = this.lineTokens.getLineContent();
				this.lineCharOffset = 0;

				lengthHeuristic += 33; // max 1000/33 = 30 lines
				// This limits the amount of work needed to recompute the min indentation.

				if (lengthHeuristic > 1000) {
					// Only break (automatically) at the end of a line.
					break;
				}
			}

			if (lengthHeuristic > 1500) {
				// Eventually break regardless of the line length so that
				// very long lines do not cause bad performance.
				// This effectively limits the max indentation to 500, as
				// indentation is not computed across multiple text nodes.
				break;
			}
		}

		// If a token contains some proper indentation, it also contains \n{INDENTATION+}(?!{INDENTATION}),
		// unless the line is too long.
		// Thus, the min indentation of the document is the minimum of the min indentations of all text nodes.
		const length = lengthDiff(startLineIdx, startLineCharOffset, this.lineIdx, this.lineCharOffset);
		return new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length));
	}
}
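// Editorial note, not part of the original file: a worked example of the
// `lengthHeuristic` budget used in `read()` above. Every character of text consumed
// on the current line adds 1, and every line break crossed adds 33, so a run of empty
// lines breaks after roughly 1000 / 33 ≈ 30 lines, while lines with real text break
// at the next line end once about 1000 characters have been scanned. The separate
// `> 1500` check also breaks mid-line, so a single extremely long line cannot produce
// an unbounded text token.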

export class FastTokenizer implements Tokenizer {
	private _offset: Length = lengthZero;
	private readonly tokens: readonly Token[];
	private idx = 0;

	constructor(private readonly text: string, brackets: BracketTokens) {
		const regExpStr = brackets.getRegExpStr();
		const regexp = regExpStr ? new RegExp(regExpStr + '|\n', 'gi') : null;

		const tokens: Token[] = [];

		let match: RegExpExecArray | null;
		let curLineCount = 0;
		let lastLineBreakOffset = 0;

		let lastTokenEndOffset = 0;
		let lastTokenEndLine = 0;

		const smallTextTokens0Line: Token[] = [];
		for (let i = 0; i < 60; i++) {
			smallTextTokens0Line.push(
				new Token(
					toLength(0, i), TokenKind.Text, -1, SmallImmutableSet.getEmpty(),
					new TextAstNode(toLength(0, i))
				)
			);
		}

		const smallTextTokens1Line: Token[] = [];
		for (let i = 0; i < 60; i++) {
			smallTextTokens1Line.push(
				new Token(
					toLength(1, i), TokenKind.Text, -1, SmallImmutableSet.getEmpty(),
					new TextAstNode(toLength(1, i))
				)
			);
		}

		if (regexp) {
			regexp.lastIndex = 0;
			// If a token contains indentation, it also contains \n{INDENTATION+}(?!{INDENTATION})
			while ((match = regexp.exec(text)) !== null) {
				const curOffset = match.index;
				const value = match[0];
				if (value === '\n') {
					curLineCount++;
					lastLineBreakOffset = curOffset + 1;
				} else {
					if (lastTokenEndOffset !== curOffset) {
						let token: Token;
						if (lastTokenEndLine === curLineCount) {
							const colCount = curOffset - lastTokenEndOffset;
							if (colCount < smallTextTokens0Line.length) {
								token = smallTextTokens0Line[colCount];
							} else {
								const length = toLength(0, colCount);
								token = new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length));
							}
						} else {
							const lineCount = curLineCount - lastTokenEndLine;
							const colCount = curOffset - lastLineBreakOffset;
							if (lineCount === 1 && colCount < smallTextTokens1Line.length) {
								token = smallTextTokens1Line[colCount];
							} else {
								const length = toLength(lineCount, colCount);
								token = new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length));
							}
						}
						tokens.push(token);
					}

					// value is matched by regexp, so the token must exist
					tokens.push(brackets.getToken(value)!);

					lastTokenEndOffset = curOffset + value.length;
					lastTokenEndLine = curLineCount;
				}
			}
		}

		const offset = text.length;

		if (lastTokenEndOffset !== offset) {
			const length = (lastTokenEndLine === curLineCount)
				? toLength(0, offset - lastTokenEndOffset)
				: toLength(curLineCount - lastTokenEndLine, offset - lastLineBreakOffset);
			tokens.push(new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length)));
		}

		this.length = toLength(curLineCount, offset - lastLineBreakOffset);
		this.tokens = tokens;
	}

	get offset(): Length {
		return this._offset;
	}

	readonly length: Length;

	read(): Token | null {
		return this.tokens[this.idx++] || null;
	}

	peek(): Token | null {
		return this.tokens[this.idx] || null;
	}

	skip(length: Length): void {
		throw new NotSupportedError();
	}

	getText(): string {
		return this.text;
	}
}
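// Editorial note, not part of the original file: an illustrative sketch of
// `FastTokenizer`, which tokenizes a plain string eagerly in its constructor. It
// assumes a `brackets: BracketTokens` instance for a single language already exists
// (constructed outside this file).
//
//   const tokenizer = new FastTokenizer('if (a) { return [1]; }', brackets);
//   for (let token = tokenizer.read(); token !== null; token = tokenizer.read()) {
//       console.log(token.kind, token.length);
//   }
//   // Unlike TextBufferTokenizer, skip() is not supported here and throws NotSupportedError.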