// Path: blob/main/src/vs/editor/common/model/bracketPairsTextModelPart/bracketPairsTree/tokenizer.ts
// 3296 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { NotSupportedError } from '../../../../../base/common/errors.js';
import { StandardTokenType, TokenMetadata } from '../../../encodedTokenAttributes.js';
import { IViewLineTokens } from '../../../tokens/lineTokens.js';
import { BracketAstNode, TextAstNode } from './ast.js';
import { BracketTokens, LanguageAgnosticBracketTokens } from './brackets.js';
import { Length, lengthAdd, lengthDiff, lengthGetColumnCountIfZeroLineCount, lengthToObj, lengthZero, toLength } from './length.js';
import { SmallImmutableSet } from './smallImmutableSet.js';

/**
 * A forward-only stream of {@link Token}s with one token of lookahead.
 */
export interface Tokenizer {
	/** The sum of the lengths of all tokens consumed (read or skipped) so far. */
	readonly offset: Length;
	/** The total length of the underlying text. */
	readonly length: Length;

	/** Consumes and returns the next token, or `null` when there is none left. */
	read(): Token | null;
	/** Returns the next token without consuming it, or `null` when there is none left. */
	peek(): Token | null;
	/** Advances the stream by `length` without producing tokens. */
	skip(length: Length): void;

	/** Returns the entire text that is being tokenized. */
	getText(): string;
}

export const enum TokenKind {
	Text = 0,
	OpeningBracket = 1,
	ClosingBracket = 2,
}

export type OpeningBracketId = number;

export class Token {
	constructor(
		readonly length: Length,
		readonly kind: TokenKind,
		/**
		 * If this token is an opening bracket, this is the id of the opening bracket.
		 * If this token is a closing bracket, this is the id of the first opening bracket that is closed by this bracket.
		 * Otherwise, it is -1.
		 */
		readonly bracketId: OpeningBracketId,
		/**
		 * If this token is an opening bracket, this just contains `bracketId`.
		 * If this token is a closing bracket, this lists all opening bracket ids, that it closes.
		 * Otherwise, it is empty.
		 */
		readonly bracketIds: SmallImmutableSet<OpeningBracketId>,
		readonly astNode: BracketAstNode | TextAstNode | undefined,
	) { }
}

/**
 * The subset of a text model that the tokenizers in this file read from.
 */
export interface ITokenizerSource {
	getValue(): string;
	getLineCount(): number;
	getLineLength(lineNumber: number): number;

	tokenization: {
		getLineTokens(lineNumber: number): IViewLineTokens;
	};
}

/**
 * Creates a plain text token (no bracket id, empty bracket id set) of the given length.
 */
function createTextToken(length: Length): Token {
	return new Token(length, TokenKind.Text, -1, SmallImmutableSet.getEmpty(), new TextAstNode(length));
}

/**
 * Tokenizes an {@link ITokenizerSource} and adds single-token lookahead
 * on top of {@link NonPeekableTextBufferTokenizer}.
 */
export class TextBufferTokenizer implements Tokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	private readonly reader: NonPeekableTextBufferTokenizer;

	constructor(
		private readonly textModel: ITokenizerSource,
		private readonly bracketTokens: LanguageAgnosticBracketTokens
	) {
		this.reader = new NonPeekableTextBufferTokenizer(this.textModel, this.bracketTokens);
		this._offset = lengthZero;
		this.didPeek = false;
		this.peeked = null;
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	private _offset: Length;

	get offset() {
		return this._offset;
	}

	get length() {
		return toLength(this.textBufferLineCount - 1, this.textBufferLastLineLength);
	}

	getText() {
		return this.textModel.getValue();
	}

	/**
	 * Advances the offset by `length` and repositions the underlying reader there.
	 * Any previously peeked token is discarded.
	 */
	skip(length: Length): void {
		// Drop the lookahead entirely; otherwise a following `read()` could
		// hand out a token that belongs to the position before the skip.
		this.didPeek = false;
		this.peeked = null;
		this._offset = lengthAdd(this._offset, length);
		const obj = lengthToObj(this._offset);
		this.reader.setPosition(obj.lineCount, obj.columnCount);
	}

	/** Whether `peeked` holds the result of a lookahead that has not been consumed yet. */
	private didPeek: boolean;
	/** The lookahead result; only meaningful while `didPeek` is `true`. */
	private peeked: Token | null;

	/**
	 * Consumes the next token (the peeked one, if any) and advances `offset` by its length.
	 * Returns `null` at the end of the document.
	 */
	read(): Token | null {
		let token: Token | null;
		if (this.didPeek) {
			// Hand out the lookahead exactly once.
			token = this.peeked;
			this.didPeek = false;
			this.peeked = null;
		} else {
			token = this.reader.read();
		}
		if (token) {
			this._offset = lengthAdd(this._offset, token.length);
		}
		return token;
	}

	/**
	 * Returns the next token without advancing `offset`.
	 * Repeated calls return the same token until `read` or `skip` is called.
	 */
	peek(): Token | null {
		if (!this.didPeek) {
			this.peeked = this.reader.read();
			this.didPeek = true;
		}
		return this.peeked;
	}
}

/**
 * Does not support peek.
 */
class NonPeekableTextBufferTokenizer {
	private readonly textBufferLineCount: number;
	private readonly textBufferLastLineLength: number;

	constructor(private readonly textModel: ITokenizerSource, private readonly bracketTokens: LanguageAgnosticBracketTokens) {
		this.textBufferLineCount = textModel.getLineCount();
		this.textBufferLastLineLength = textModel.getLineLength(this.textBufferLineCount);
	}

	/** Zero-based index of the current line. */
	private lineIdx = 0;
	/** Content of the current line, or `null` if it has not been loaded yet. */
	private line: string | null = null;
	/** Character offset of the read position within the current line. */
	private lineCharOffset = 0;
	private lineTokens: IViewLineTokens | null = null;
	/** Index into `lineTokens` of the token at the read position. */
	private lineTokenOffset = 0;

	/**
	 * Moves the read position to the given zero-based line index and column.
	 * Discards a pending bracket token, if any.
	 */
	public setPosition(lineIdx: number, column: number): void {
		// We must not jump into a token!
		if (lineIdx === this.lineIdx) {
			this.lineCharOffset = column;
			if (this.line !== null) {
				this.lineTokenOffset = this.lineCharOffset === 0 ? 0 : this.lineTokens!.findTokenIndexAtOffset(this.lineCharOffset);
			}
		} else {
			this.lineIdx = lineIdx;
			this.lineCharOffset = column;
			// Forces `read` to (re)load the line tokens lazily.
			this.line = null;
		}
		this.peekedToken = null;
	}

	/** Must be a zero line token. The end of the document cannot be peeked. */
	private peekedToken: Token | null = null;

	/**
	 * Returns the next token: either a single bracket token, or a text token
	 * covering everything up to the next bracket (bounded by a length heuristic).
	 * Returns `null` once the read position is at or after the end of the document.
	 */
	public read(): Token | null {
		if (this.peekedToken) {
			// A bracket was found while producing the previous text token; emit it now.
			const token = this.peekedToken;
			this.peekedToken = null;
			this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(token.length);
			return token;
		}

		if (this.lineIdx > this.textBufferLineCount - 1 || (this.lineIdx === this.textBufferLineCount - 1 && this.lineCharOffset >= this.textBufferLastLineLength)) {
			// We are after the end
			return null;
		}

		if (this.line === null) {
			// `getLineTokens` takes a one-based line number.
			this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
			this.line = this.lineTokens.getLineContent();
			this.lineTokenOffset = this.lineCharOffset === 0 ? 0 : this.lineTokens.findTokenIndexAtOffset(this.lineCharOffset);
		}

		const startLineIdx = this.lineIdx;
		const startLineCharOffset = this.lineCharOffset;

		// limits the length of text tokens.
		// If text tokens get too long, incremental updates will be slow
		let lengthHeuristic = 0;
		while (true) {
			const lineTokens = this.lineTokens!;
			const tokenCount = lineTokens.getCount();

			let peekedBracketToken: Token | null = null;

			if (this.lineTokenOffset < tokenCount) {
				const tokenMetadata = lineTokens.getMetadata(this.lineTokenOffset);
				while (this.lineTokenOffset + 1 < tokenCount && tokenMetadata === lineTokens.getMetadata(this.lineTokenOffset + 1)) {
					// Skip tokens that are identical.
					// Sometimes, (bracket) identifiers are split up into multiple tokens.
					this.lineTokenOffset++;
				}

				const isOther = TokenMetadata.getTokenType(tokenMetadata) === StandardTokenType.Other;
				const containsBracketType = TokenMetadata.containsBalancedBrackets(tokenMetadata);

				const endOffset = lineTokens.getEndOffset(this.lineTokenOffset);
				// Is there a bracket token next? Only consume text.
				if (containsBracketType && isOther && this.lineCharOffset < endOffset) {
					const languageId = lineTokens.getLanguageId(this.lineTokenOffset);
					const text = this.line.substring(this.lineCharOffset, endOffset);

					const brackets = this.bracketTokens.getSingleLanguageBracketTokens(languageId);
					const regexp = brackets.regExpGlobal;
					if (regexp) {
						// The regexp is global and shared, so reset its state before use.
						regexp.lastIndex = 0;
						const match = regexp.exec(text);
						if (match) {
							peekedBracketToken = brackets.getToken(match[0])!;
							if (peekedBracketToken) {
								// Consume leading text of the token
								this.lineCharOffset += match.index;
							}
						}
					}
				}

				lengthHeuristic += endOffset - this.lineCharOffset;

				if (peekedBracketToken) {
					// Don't skip the entire token, as a single token could contain multiple brackets.

					if (startLineIdx !== this.lineIdx || startLineCharOffset !== this.lineCharOffset) {
						// There is text before the bracket
						this.peekedToken = peekedBracketToken;
						break;
					} else {
						// Consume the peeked token
						this.lineCharOffset += lengthGetColumnCountIfZeroLineCount(peekedBracketToken.length);
						return peekedBracketToken;
					}
				} else {
					// Skip the entire token, as the token contains no brackets at all.
					this.lineTokenOffset++;
					this.lineCharOffset = endOffset;
				}
			} else {
				// All tokens of the current line are consumed.
				if (this.lineIdx === this.textBufferLineCount - 1) {
					break;
				}
				this.lineIdx++;
				this.lineTokens = this.textModel.tokenization.getLineTokens(this.lineIdx + 1);
				this.lineTokenOffset = 0;
				this.line = this.lineTokens.getLineContent();
				this.lineCharOffset = 0;

				lengthHeuristic += 33; // max 1000/33 = 30 lines
				// This limits the amount of work to recompute min-indentation

				if (lengthHeuristic > 1000) {
					// only break (automatically) at the end of line.
					break;
				}
			}

			if (lengthHeuristic > 1500) {
				// Eventually break regardless of the line length so that
				// very long lines do not cause bad performance.
				// This effectively limits max indentation to 500, as
				// indentation is not computed across multiple text nodes.
				break;
			}
		}

		// If a token contains some proper indentation, it also contains \n{INDENTATION+}(?!{INDENTATION}),
		// unless the line is too long.
		// Thus, the min indentation of the document is the minimum min indentation of every text node.
		const length = lengthDiff(startLineIdx, startLineCharOffset, this.lineIdx, this.lineCharOffset);
		return createTextToken(length);
	}
}

/**
 * Tokenizes a string eagerly in the constructor using the bracket regular
 * expression of a single {@link BracketTokens} instance.
 * Does not support `skip`.
 */
export class FastTokenizer implements Tokenizer {
	private _offset: Length = lengthZero;
	private readonly tokens: readonly Token[];
	private idx = 0;

	constructor(private readonly text: string, brackets: BracketTokens) {
		const regExpStr = brackets.getRegExpStr();
		// Line breaks are matched as well so that line/column lengths can be tracked.
		const regexp = regExpStr ? new RegExp(regExpStr + '|\n', 'gi') : null;

		const tokens: Token[] = [];

		let match: RegExpExecArray | null;
		let curLineCount = 0;
		let lastLineBreakOffset = 0;

		let lastTokenEndOffset = 0;
		let lastTokenEndLine = 0;

		// Shared instances for short text tokens spanning zero line breaks (index = column count).
		const smallTextTokens0Line: Token[] = [];
		for (let i = 0; i < 60; i++) {
			smallTextTokens0Line.push(createTextToken(toLength(0, i)));
		}

		// Shared instances for short text tokens spanning exactly one line break (index = column count).
		const smallTextTokens1Line: Token[] = [];
		for (let i = 0; i < 60; i++) {
			smallTextTokens1Line.push(createTextToken(toLength(1, i)));
		}

		if (regexp) {
			regexp.lastIndex = 0;
			// If a token contains indentation, it also contains \n{INDENTATION+}(?!{INDENTATION})
			while ((match = regexp.exec(text)) !== null) {
				const curOffset = match.index;
				const value = match[0];
				if (value === '\n') {
					curLineCount++;
					lastLineBreakOffset = curOffset + 1;
				} else {
					if (lastTokenEndOffset !== curOffset) {
						// There is text between the previous bracket and this one.
						let token: Token;
						if (lastTokenEndLine === curLineCount) {
							const colCount = curOffset - lastTokenEndOffset;
							if (colCount < smallTextTokens0Line.length) {
								token = smallTextTokens0Line[colCount];
							} else {
								token = createTextToken(toLength(0, colCount));
							}
						} else {
							const lineCount = curLineCount - lastTokenEndLine;
							const colCount = curOffset - lastLineBreakOffset;
							if (lineCount === 1 && colCount < smallTextTokens1Line.length) {
								token = smallTextTokens1Line[colCount];
							} else {
								token = createTextToken(toLength(lineCount, colCount));
							}
						}
						tokens.push(token);
					}

					// value is matched by regexp, so the token must exist
					tokens.push(brackets.getToken(value)!);

					lastTokenEndOffset = curOffset + value.length;
					lastTokenEndLine = curLineCount;
				}
			}
		}

		const offset = text.length;

		if (lastTokenEndOffset !== offset) {
			// Trailing text after the last bracket (or the entire text if there was none).
			const length = (lastTokenEndLine === curLineCount)
				? toLength(0, offset - lastTokenEndOffset)
				: toLength(curLineCount - lastTokenEndLine, offset - lastLineBreakOffset);
			tokens.push(createTextToken(length));
		}

		this.length = toLength(curLineCount, offset - lastLineBreakOffset);
		this.tokens = tokens;
	}

	get offset(): Length {
		return this._offset;
	}

	readonly length: Length;

	/**
	 * Consumes the next token and advances `offset` by its length.
	 * Returns `null` once all tokens have been read.
	 */
	read(): Token | null {
		const token = this.peek();
		if (token) {
			this.idx++;
			this._offset = lengthAdd(this._offset, token.length);
		}
		return token;
	}

	/** Returns the next token without consuming it, or `null` if there is none. */
	peek(): Token | null {
		return this.tokens[this.idx] || null;
	}

	/**
	 * Not supported by this tokenizer.
	 * @throws NotSupportedError always.
	 */
	skip(length: Length): void {
		throw new NotSupportedError();
	}

	getText(): string {
		return this.text;
	}
}