Path: blob/main/src/vs/editor/common/services/unicodeTextModelHighlighter.ts
3294 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { IRange, Range } from '../core/range.js';
import { Searcher } from '../model/textModelSearch.js';
import * as strings from '../../../base/common/strings.js';
import { IUnicodeHighlightsResult } from './editorWorker.js';
import { assertNever } from '../../../base/common/assert.js';
import { DEFAULT_WORD_REGEXP, getWordAtText } from '../core/wordHelper.js';

/**
 * Static helpers that scan text for characters which should be highlighted
 * (non basic ASCII, invisible or ambiguous characters, depending on the options).
 */
export class UnicodeTextModelHighlighter {
	/**
	 * Scans the lines of `model` and collects the ranges of characters that should be highlighted.
	 *
	 * @param model Provides the line count and the content of each line.
	 * @param options Selects which kinds of characters are highlighted and which are allowed.
	 * @param range Optional range; only its start and end line numbers are used to restrict
	 * the scanned lines. When omitted, lines 1 to `model.getLineCount()` are scanned.
	 * @returns The collected ranges (1-based columns), one counter per highlight reason and
	 * `hasMore`, which is `true` when scanning stopped because the result limit was reached.
	 */
	public static computeUnicodeHighlights(model: IUnicodeCharacterSearcherTarget, options: UnicodeHighlighterOptions, range?: IRange): IUnicodeHighlightsResult {
		// Upper bound for the number of reported ranges.
		const MAX_RESULT_LENGTH = 1000;

		const startLine = range ? range.startLineNumber : 1;
		const endLine = range ? range.endLineNumber : model.getLineCount();

		const codePointHighlighter = new CodePointHighlighter(options);

		// Note: neither regex uses the `u` flag, so they match single UTF-16 code units.
		const candidates = codePointHighlighter.getCandidateCodePoints();
		let regex: RegExp;
		if (candidates === 'allNonBasicAscii') {
			regex = new RegExp('[^\\t\\n\\r\\x20-\\x7E]', 'g');
		} else {
			regex = new RegExp(`${buildRegExpCharClassExpr(Array.from(candidates))}`, 'g');
		}

		const searcher = new Searcher(null, regex);
		const ranges: Range[] = [];
		let hasMore = false;
		let m: RegExpExecArray | null;

		let ambiguousCharacterCount = 0;
		let invisibleCharacterCount = 0;
		let nonBasicAsciiCharacterCount = 0;

		forLoop:
		for (let lineNumber = startLine, lineCount = endLine; lineNumber <= lineCount; lineNumber++) {
			const lineContent = model.getLineContent(lineNumber);
			const lineLength = lineContent.length;

			// End offset (exclusive) of the last code point that was evaluated on this line.
			let lastProcessedEndIndex = 0;

			// Reset regex to search from the beginning
			searcher.reset(0);
			do {
				m = searcher.next(lineContent);
				if (m) {
					let startIndex = m.index;
					let endIndex = m.index + m[0].length;

					// Extend range to entire code point
					if (startIndex > 0) {
						const charCodeBefore = lineContent.charCodeAt(startIndex - 1);
						if (strings.isHighSurrogate(charCodeBefore)) {
							startIndex--;
						}
					}
					// `endIndex < lineLength` (not `endIndex + 1 < lineLength`): the low surrogate
					// may be the very last code unit of the line.
					if (endIndex < lineLength) {
						const lastMatchedCharCode = lineContent.charCodeAt(endIndex - 1);
						if (strings.isHighSurrogate(lastMatchedCharCode)) {
							endIndex++;
						}
					}

					// Both halves of a surrogate pair can match the code-unit based regex
					// individually and are then widened to the same code point. Skip the
					// second half so that the code point is reported and counted only once.
					if (endIndex <= lastProcessedEndIndex) {
						continue;
					}
					lastProcessedEndIndex = endIndex;

					const str = lineContent.substring(startIndex, endIndex);
					let word = getWordAtText(startIndex + 1, DEFAULT_WORD_REGEXP, lineContent, 0);
					if (word && word.endColumn <= startIndex + 1) {
						// The word does not include the problematic character, ignore the word
						word = null;
					}
					const highlightReason = codePointHighlighter.shouldHighlightNonBasicASCII(str, word ? word.word : null);

					if (highlightReason !== SimpleHighlightReason.None) {
						if (highlightReason === SimpleHighlightReason.Ambiguous) {
							ambiguousCharacterCount++;
						} else if (highlightReason === SimpleHighlightReason.Invisible) {
							invisibleCharacterCount++;
						} else if (highlightReason === SimpleHighlightReason.NonBasicASCII) {
							nonBasicAsciiCharacterCount++;
						} else {
							assertNever(highlightReason);
						}

						if (ranges.length >= MAX_RESULT_LENGTH) {
							hasMore = true;
							break forLoop;
						}

						ranges.push(new Range(lineNumber, startIndex + 1, lineNumber, endIndex + 1));
					}
				}
			} while (m);
		}
		return {
			ranges,
			hasMore,
			ambiguousCharacterCount,
			invisibleCharacterCount,
			nonBasicAsciiCharacterCount
		};
	}

	/**
	 * Computes why `char` would be highlighted under `options`.
	 *
	 * @param char The character to classify; only its first code point is inspected.
	 * @param options The highlighter options to evaluate the character against.
	 * @returns `null` when the character is not highlighted, otherwise the reason. For
	 * ambiguous characters the reason carries the primary confusable character and the
	 * locales which, when added to the allowed locales, make the character non-ambiguous.
	 */
	public static computeUnicodeHighlightReason(char: string, options: UnicodeHighlighterOptions): UnicodeHighlighterReason | null {
		const codePointHighlighter = new CodePointHighlighter(options);

		// No word context is available here, so the character is judged on its own.
		const reason = codePointHighlighter.shouldHighlightNonBasicASCII(char, null);
		switch (reason) {
			case SimpleHighlightReason.None:
				return null;
			case SimpleHighlightReason.Invisible:
				return { kind: UnicodeHighlighterReasonKind.Invisible };

			case SimpleHighlightReason.Ambiguous: {
				const codePoint = char.codePointAt(0)!;
				const primaryConfusable = codePointHighlighter.ambiguousCharacters.getPrimaryConfusable(codePoint)!;
				const notAmbiguousInLocales =
					strings.AmbiguousCharacters.getLocales().filter(
						(l) =>
							!strings.AmbiguousCharacters.getInstance(
								new Set([...options.allowedLocales, l])
							).isAmbiguous(codePoint)
					);
				return { kind: UnicodeHighlighterReasonKind.Ambiguous, confusableWith: String.fromCodePoint(primaryConfusable), notAmbiguousInLocales };
			}
			case SimpleHighlightReason.NonBasicASCII:
				return { kind: UnicodeHighlighterReasonKind.NonBasicAscii };
		}
	}
}

/**
 * Builds the source of a regular expression character class (`[...]`) that contains
 * the given code points, with regex special characters escaped.
 *
 * @param codePoints The code points to include in the character class.
 * @param flags Not used by this function.
 */
function buildRegExpCharClassExpr(codePoints: number[], flags?: string): string {
	const src = `[${strings.escapeRegExpCharacters(
		codePoints.map((i) => String.fromCodePoint(i)).join('')
	)}]`;
	return src;
}

/** The kinds of reasons reported by `computeUnicodeHighlightReason`. */
export const enum UnicodeHighlighterReasonKind {
	Ambiguous, Invisible, NonBasicAscii
}

/**
 * The reason a character is highlighted, discriminated by `kind`.
 * Only the ambiguous variant carries additional data.
 */
export type UnicodeHighlighterReason = {
	kind: UnicodeHighlighterReasonKind.Ambiguous;
	confusableWith: string;
	notAmbiguousInLocales: string[];
} | {
	kind: UnicodeHighlighterReasonKind.Invisible;
} | {
	kind: UnicodeHighlighterReasonKind.NonBasicAscii;
};

/** Decides, for a fixed set of options, whether individual code points are highlighted. */
class CodePointHighlighter {
	private readonly allowedCodePoints: Set<number>;
	public readonly ambiguousCharacters: strings.AmbiguousCharacters;
	constructor(private readonly options: UnicodeHighlighterOptions) {
		this.allowedCodePoints = new Set(options.allowedCodePoints);
		this.ambiguousCharacters = strings.AmbiguousCharacters.getInstance(new Set(options.allowedLocales));
	}

	/**
	 * Returns the code points that have to be searched for.
	 *
	 * @returns `'allNonBasicAscii'` when the `nonBasicASCII` option is set. Otherwise the
	 * set of enabled invisible and confusable code points, without the allowed code points.
	 */
	public getCandidateCodePoints(): Set<number> | 'allNonBasicAscii' {
		if (this.options.nonBasicASCII) {
			return 'allNonBasicAscii';
		}

		const set = new Set<number>();

		if (this.options.invisibleCharacters) {
			for (const cp of strings.InvisibleCharacters.codePoints) {
				if (!isAllowedInvisibleCharacter(String.fromCodePoint(cp))) {
					set.add(cp);
				}
			}
		}

		if (this.options.ambiguousCharacters) {
			for (const cp of this.ambiguousCharacters.getConfusableCodePoints()) {
				set.add(cp);
			}
		}

		for (const cp of this.allowedCodePoints) {
			set.delete(cp);
		}

		return set;
	}

	/**
	 * Classifies a character.
	 *
	 * @param character The character to classify; only its first code point is inspected.
	 * @param wordContext The word containing the character, or `null` if there is none.
	 * @returns The reason the character should be highlighted, or `SimpleHighlightReason.None`.
	 */
	public shouldHighlightNonBasicASCII(character: string, wordContext: string | null): SimpleHighlightReason {
		const codePoint = character.codePointAt(0)!;

		// Explicitly allowed code points are never highlighted.
		if (this.allowedCodePoints.has(codePoint)) {
			return SimpleHighlightReason.None;
		}

		if (this.options.nonBasicASCII) {
			return SimpleHighlightReason.NonBasicASCII;
		}

		let hasBasicASCIICharacters = false;
		let hasNonConfusableNonBasicAsciiCharacter = false;
		if (wordContext) {
			for (const char of wordContext) {
				const codePoint = char.codePointAt(0)!;
				const isBasicASCII = strings.isBasicASCII(char);
				hasBasicASCIICharacters = hasBasicASCIICharacters || isBasicASCII;

				if (
					!isBasicASCII &&
					!this.ambiguousCharacters.isAmbiguous(codePoint) &&
					!strings.InvisibleCharacters.isInvisibleCharacter(codePoint)
				) {
					hasNonConfusableNonBasicAsciiCharacter = true;
				}
			}
		}

		// A word without any basic ASCII character that contains at least one non-ASCII
		// character which is neither ambiguous nor invisible is not highlighted.
		if (
			/* Don't allow mixing weird looking characters with ASCII */ !hasBasicASCIICharacters &&
			/* Is there an obviously weird looking character? */ hasNonConfusableNonBasicAsciiCharacter
		) {
			return SimpleHighlightReason.None;
		}

		if (this.options.invisibleCharacters) {
			// TODO check for emojis
			if (!isAllowedInvisibleCharacter(character) && strings.InvisibleCharacters.isInvisibleCharacter(codePoint)) {
				return SimpleHighlightReason.Invisible;
			}
		}

		if (this.options.ambiguousCharacters) {
			if (this.ambiguousCharacters.isAmbiguous(codePoint)) {
				return SimpleHighlightReason.Ambiguous;
			}
		}

		return SimpleHighlightReason.None;
	}
}

/** Returns `true` for a space, a line feed or a tab; these are never treated as invisible characters. */
function isAllowedInvisibleCharacter(character: string): boolean {
	return character === ' ' || character === '\n' || character === '\t';
}

/** Internal classification result of `CodePointHighlighter.shouldHighlightNonBasicASCII`. */
const enum SimpleHighlightReason {
	None,
	NonBasicASCII,
	Invisible,
	Ambiguous
}

/** Minimal line-based text source that `computeUnicodeHighlights` reads from. */
export interface IUnicodeCharacterSearcherTarget {
	getLineCount(): number;
	getLineContent(lineNumber: number): string;
}

/** Options controlling which characters are highlighted. */
export interface UnicodeHighlighterOptions {
	nonBasicASCII: boolean;
	ambiguousCharacters: boolean;
	invisibleCharacters: boolean;
	includeComments: boolean;
	includeStrings: boolean;
	allowedCodePoints: number[];
	allowedLocales: string[];
}