Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/editor/common/services/unicodeTextModelHighlighter.ts
3294 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { IRange, Range } from '../core/range.js';
7
import { Searcher } from '../model/textModelSearch.js';
8
import * as strings from '../../../base/common/strings.js';
9
import { IUnicodeHighlightsResult } from './editorWorker.js';
10
import { assertNever } from '../../../base/common/assert.js';
11
import { DEFAULT_WORD_REGEXP, getWordAtText } from '../core/wordHelper.js';
12
13
/**
 * Static helpers that locate characters the Unicode highlighter should flag
 * and that explain why a single character is flagged.
 */
export class UnicodeTextModelHighlighter {
	/**
	 * Scans the lines of `model` and collects the ranges of all characters that
	 * `options` asks to highlight.
	 *
	 * @param model Line-based text source to scan.
	 * @param options Which character categories to report and which code points / locales are exempt.
	 * @param range Optional range. Only its start and end line numbers are used (whole lines are scanned).
	 *              When omitted, lines 1 through `model.getLineCount()` are scanned.
	 * @returns The collected ranges (capped at 1000 entries), a `hasMore` flag that is set when the
	 *          cap stopped the scan early, and one counter per highlight reason.
	 */
	public static computeUnicodeHighlights(model: IUnicodeCharacterSearcherTarget, options: UnicodeHighlighterOptions, range?: IRange): IUnicodeHighlightsResult {
		// Upper bound on the number of ranges reported in a single result.
		const MAX_RESULT_LENGTH = 1000;

		const startLine = range ? range.startLineNumber : 1;
		const endLine = range ? range.endLineNumber : model.getLineCount();

		const codePointHighlighter = new CodePointHighlighter(options);

		const candidates = codePointHighlighter.getCandidateCodePoints();
		let regex: RegExp;
		if (candidates === 'allNonBasicAscii') {
			// Everything except tab, line feed, carriage return and printable ASCII.
			regex = new RegExp('[^\\t\\n\\r\\x20-\\x7E]', 'g');
		} else {
			regex = new RegExp(`${buildRegExpCharClassExpr(Array.from(candidates))}`, 'g');
		}

		const searcher = new Searcher(null, regex);
		const ranges: Range[] = [];
		let hasMore = false;
		let m: RegExpExecArray | null;

		let ambiguousCharacterCount = 0;
		let invisibleCharacterCount = 0;
		let nonBasicAsciiCharacterCount = 0;

		forLoop:
		for (let lineNumber = startLine, lineCount = endLine; lineNumber <= lineCount; lineNumber++) {
			const lineContent = model.getLineContent(lineNumber);
			const lineLength = lineContent.length;

			// Reset regex to search from the beginning
			searcher.reset(0);
			do {
				m = searcher.next(lineContent);
				if (m) {
					let startIndex = m.index;
					let endIndex = m.index + m[0].length;

					// Extend range to entire code point. The regex is created without the `u` flag,
					// so a match can be just one UTF-16 code unit of a surrogate pair.
					if (startIndex > 0) {
						const charCodeBefore = lineContent.charCodeAt(startIndex - 1);
						if (strings.isHighSurrogate(charCodeBefore)) {
							startIndex--;
						}
					}
					// There must be at least one more code unit after the match for a low surrogate
					// to follow. (The previous bound `endIndex + 1 < lineLength` was off by one and
					// missed a pair whose low surrogate is the last code unit of the line.)
					if (endIndex < lineLength) {
						const lastMatchedCharCode = lineContent.charCodeAt(endIndex - 1);
						if (strings.isHighSurrogate(lastMatchedCharCode)) {
							endIndex++;
						}
					}
					const str = lineContent.substring(startIndex, endIndex);
					let word = getWordAtText(startIndex + 1, DEFAULT_WORD_REGEXP, lineContent, 0);
					if (word && word.endColumn <= startIndex + 1) {
						// The word does not include the problematic character, ignore the word
						word = null;
					}
					const highlightReason = codePointHighlighter.shouldHighlightNonBasicASCII(str, word ? word.word : null);

					if (highlightReason !== SimpleHighlightReason.None) {
						// Counters are updated before the cap check, so the character that hits
						// the cap is counted even though its range is not reported.
						if (highlightReason === SimpleHighlightReason.Ambiguous) {
							ambiguousCharacterCount++;
						} else if (highlightReason === SimpleHighlightReason.Invisible) {
							invisibleCharacterCount++;
						} else if (highlightReason === SimpleHighlightReason.NonBasicASCII) {
							nonBasicAsciiCharacterCount++;
						} else {
							assertNever(highlightReason);
						}

						if (ranges.length >= MAX_RESULT_LENGTH) {
							hasMore = true;
							break forLoop;
						}

						// Range columns are the 0-based string indices shifted by one.
						ranges.push(new Range(lineNumber, startIndex + 1, lineNumber, endIndex + 1));
					}
				}
			} while (m);
		}
		return {
			ranges,
			hasMore,
			ambiguousCharacterCount,
			invisibleCharacterCount,
			nonBasicAsciiCharacterCount
		};
	}

	/**
	 * Explains why a single character would be highlighted under `options`.
	 *
	 * @param char The character to classify; its first code point is examined.
	 * @param options Highlighter configuration used for the classification.
	 * @returns `null` when the character is not highlighted, otherwise a reason object. For an
	 *          ambiguous character the reason carries the primary confusable character and the
	 *          locales that, when added to `options.allowedLocales`, make it non-ambiguous.
	 */
	public static computeUnicodeHighlightReason(char: string, options: UnicodeHighlighterOptions): UnicodeHighlighterReason | null {
		const codePointHighlighter = new CodePointHighlighter(options);

		// No word context is available for a single character.
		const reason = codePointHighlighter.shouldHighlightNonBasicASCII(char, null);
		switch (reason) {
			case SimpleHighlightReason.None:
				return null;
			case SimpleHighlightReason.Invisible:
				return { kind: UnicodeHighlighterReasonKind.Invisible };

			case SimpleHighlightReason.Ambiguous: {
				const codePoint = char.codePointAt(0)!;
				const primaryConfusable = codePointHighlighter.ambiguousCharacters.getPrimaryConfusable(codePoint)!;
				const notAmbiguousInLocales =
					strings.AmbiguousCharacters.getLocales().filter(
						(l) =>
							!strings.AmbiguousCharacters.getInstance(
								new Set([...options.allowedLocales, l])
							).isAmbiguous(codePoint)
					);
				return { kind: UnicodeHighlighterReasonKind.Ambiguous, confusableWith: String.fromCodePoint(primaryConfusable), notAmbiguousInLocales };
			}
			case SimpleHighlightReason.NonBasicASCII:
				return { kind: UnicodeHighlighterReasonKind.NonBasicAscii };
		}
	}
}
129
130
/**
 * Builds the source text of a regular-expression character class (`[...]`)
 * whose members are the given code points.
 *
 * @param codePoints Code points to include in the class.
 * @param flags Not read by this function.
 * @returns The bracketed character-class source, with its members passed through
 *          `strings.escapeRegExpCharacters`.
 */
function buildRegExpCharClassExpr(codePoints: number[], flags?: string): string {
	let members = '';
	for (const codePoint of codePoints) {
		members += String.fromCodePoint(codePoint);
	}
	const escapedMembers = strings.escapeRegExpCharacters(members);
	return `[${escapedMembers}]`;
}
136
137
/**
 * Discriminator for {@link UnicodeHighlighterReason}: the category under which
 * a character is highlighted.
 */
export const enum UnicodeHighlighterReasonKind {
	/** The character is reported as ambiguous (confusable with another character). */
	Ambiguous = 0,
	/** The character is reported as invisible. */
	Invisible = 1,
	/** The character is reported because it is not basic ASCII. */
	NonBasicAscii = 2
}
140
141
/**
 * Explanation of why a character is highlighted, discriminated by `kind`.
 * Only the ambiguous variant carries extra data.
 */
export type UnicodeHighlighterReason =
	| {
		kind: UnicodeHighlighterReasonKind.Ambiguous;
		/** The character this one can be confused with. */
		confusableWith: string;
		/** Locales in which the character is not considered ambiguous. */
		notAmbiguousInLocales: string[];
	}
	| {
		kind: UnicodeHighlighterReasonKind.Invisible;
	}
	| {
		kind: UnicodeHighlighterReasonKind.NonBasicAscii;
	};
150
151
/**
 * Decides, for a given set of highlighter options, which code points are
 * candidates for highlighting and why an individual character is highlighted.
 */
class CodePointHighlighter {
	/** Code points from `options.allowedCodePoints`; these are never highlighted. */
	private readonly allowedCodePoints: Set<number>;
	/** Ambiguity lookup obtained for `options.allowedLocales`. */
	public readonly ambiguousCharacters: strings.AmbiguousCharacters;

	constructor(private readonly options: UnicodeHighlighterOptions) {
		this.allowedCodePoints = new Set(options.allowedCodePoints);
		this.ambiguousCharacters = strings.AmbiguousCharacters.getInstance(new Set(options.allowedLocales));
	}

	/**
	 * Returns the code points worth searching for.
	 *
	 * @returns The marker `'allNonBasicAscii'` when the `nonBasicASCII` option is on; otherwise the
	 *          set of enabled invisible and confusable code points minus the explicitly allowed ones.
	 */
	public getCandidateCodePoints(): Set<number> | 'allNonBasicAscii' {
		if (this.options.nonBasicASCII) {
			return 'allNonBasicAscii';
		}

		const candidates = new Set<number>();

		if (this.options.invisibleCharacters) {
			for (const invisible of strings.InvisibleCharacters.codePoints) {
				if (isAllowedInvisibleCharacter(String.fromCodePoint(invisible))) {
					continue;
				}
				candidates.add(invisible);
			}
		}

		if (this.options.ambiguousCharacters) {
			for (const confusable of this.ambiguousCharacters.getConfusableCodePoints()) {
				candidates.add(confusable);
			}
		}

		// Explicitly allowed code points are removed last so they win over both categories.
		this.allowedCodePoints.forEach((allowed) => {
			candidates.delete(allowed);
		});

		return candidates;
	}

	/**
	 * Classifies a character.
	 *
	 * @param character The character to classify; only its first code point is examined.
	 * @param wordContext The word containing the character, or `null` when there is none.
	 * @returns The reason the character is highlighted, or `SimpleHighlightReason.None`.
	 */
	public shouldHighlightNonBasicASCII(character: string, wordContext: string | null): SimpleHighlightReason {
		const codePoint = character.codePointAt(0)!;

		if (this.allowedCodePoints.has(codePoint)) {
			return SimpleHighlightReason.None;
		}

		if (this.options.nonBasicASCII) {
			return SimpleHighlightReason.NonBasicASCII;
		}

		let wordHasBasicAscii = false;
		let wordHasPlainNonAscii = false;
		if (wordContext) {
			for (const wordChar of wordContext) {
				if (strings.isBasicASCII(wordChar)) {
					wordHasBasicAscii = true;
					continue;
				}
				// Non-ASCII character that is neither confusable nor invisible.
				const wordCodePoint = wordChar.codePointAt(0)!;
				if (
					!this.ambiguousCharacters.isAmbiguous(wordCodePoint) &&
					!strings.InvisibleCharacters.isInvisibleCharacter(wordCodePoint)
				) {
					wordHasPlainNonAscii = true;
				}
			}
		}

		// A word without any ASCII that contains an obviously non-ASCII character is left alone;
		// mixing weird looking characters with ASCII is not allowed.
		if (!wordHasBasicAscii && wordHasPlainNonAscii) {
			return SimpleHighlightReason.None;
		}

		// TODO check for emojis
		if (
			this.options.invisibleCharacters &&
			!isAllowedInvisibleCharacter(character) &&
			strings.InvisibleCharacters.isInvisibleCharacter(codePoint)
		) {
			return SimpleHighlightReason.Invisible;
		}

		if (this.options.ambiguousCharacters && this.ambiguousCharacters.isAmbiguous(codePoint)) {
			return SimpleHighlightReason.Ambiguous;
		}

		return SimpleHighlightReason.None;
	}
}
239
240
/**
 * Tells whether `character` is one of the invisible characters that are always
 * accepted: a space, a line feed, or a tab.
 *
 * @param character The string to test; it must equal one of those characters exactly.
 * @returns `true` for `' '`, `'\n'` and `'\t'`, `false` for anything else.
 */
function isAllowedInvisibleCharacter(character: string): boolean {
	switch (character) {
		case ' ':
		case '\n':
		case '\t':
			return true;
		default:
			return false;
	}
}
243
244
/**
 * Internal classification result for a single character.
 */
const enum SimpleHighlightReason {
	/** The character is not highlighted. */
	None = 0,
	/** Highlighted because it is not basic ASCII. */
	NonBasicASCII = 1,
	/** Highlighted because it is an invisible character. */
	Invisible = 2,
	/** Highlighted because it is an ambiguous character. */
	Ambiguous = 3
}
250
251
/**
 * Minimal line-based text source that the highlighter scans.
 */
export interface IUnicodeCharacterSearcherTarget {
	/** Returns the number of lines; used as the last line to scan when no range is given. */
	getLineCount(): number;

	/**
	 * Returns the text of one line.
	 *
	 * @param lineNumber Line to read; scanning starts at line 1 when no range is given.
	 */
	getLineContent(lineNumber: number): string;
}
255
256
/**
 * Configuration for the Unicode highlighter.
 */
export interface UnicodeHighlighterOptions {
	/** When set, every character outside tab, line feed, carriage return and printable ASCII is a candidate. */
	nonBasicASCII: boolean;

	/** When set, characters considered ambiguous for the allowed locales are highlighted. */
	ambiguousCharacters: boolean;

	/** When set, invisible characters (other than space, line feed and tab) are highlighted. */
	invisibleCharacters: boolean;

	/** Not read in this file; presumably controls whether comments are inspected — confirm with callers. */
	includeComments: boolean;

	/** Not read in this file; presumably controls whether string literals are inspected — confirm with callers. */
	includeStrings: boolean;

	/** Code points that are never highlighted. */
	allowedCodePoints: number[];

	/** Locales passed to the ambiguity lookup. */
	allowedLocales: string[];
}
265
266