Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/services/languageDetection/browser/languageDetectionWebWorker.ts
3296 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import type { ModelOperations, ModelResult } from '@vscode/vscode-languagedetection';
7
import { importAMDNodeModule } from '../../../../amdX.js';
8
import { StopWatch } from '../../../../base/common/stopwatch.js';
9
import { IWebWorkerServerRequestHandler, IWebWorkerServer } from '../../../../base/common/worker/webWorker.js';
10
import { LanguageDetectionWorkerHost, ILanguageDetectionWorker } from './languageDetectionWorker.protocol.js';
11
import { WorkerTextModelSyncServer } from '../../../../editor/common/services/textModelSync/textModelSync.impl.js';
12
13
/**
 * Shape of the regexp-based detection module.
 *
 * `detect` receives the text to classify, a table of per-language biases and
 * an optional list of candidate language ids, and returns a language id or
 * `undefined`.
 */
type RegexpModel = {
	detect: (
		inp: string,
		langBiases: Record<string, number>,
		supportedLangs?: string[]
	) => string | undefined;
};
14
15
/**
 * Creates the `LanguageDetectionWorker` for the given worker server and
 * returns it as that server's request handler.
 *
 * @param workerServer Server handed to the `LanguageDetectionWorker` constructor.
 * @returns The newly constructed worker.
 */
export function create(workerServer: IWebWorkerServer): IWebWorkerServerRequestHandler {
	const detectionWorker = new LanguageDetectionWorker(workerServer);
	return detectionWorker;
}
18
19
/**
 * Request handler that detects the language of a synced text model.
 *
 * Two detectors are available: a neural model driven through
 * `ModelOperations`, and a regexp model that is steered by per-language
 * biases. Each one is loaded on first use and cached; a failed load is
 * remembered so that it is not attempted again.
 *
 * @internal
 */
export class LanguageDetectionWorker implements ILanguageDetectionWorker {
	_requestHandlerBrand: any;

	/**
	 * Minimum confidence a neural result needs in order to be considered, and
	 * also the confidence drop between two neighbouring results that causes the
	 * results collected so far to be reported.
	 */
	private static readonly expectedRelativeConfidence = 0.2;
	/** Added to the confidence of js, html, json, ts, css, py, xml and php results. */
	private static readonly positiveConfidenceCorrectionBucket1 = 0.05;
	/** Added to the confidence of cpp, sh, java, cs and c results. */
	private static readonly positiveConfidenceCorrectionBucket2 = 0.025;
	/** Subtracted from the confidence of bat, ini, makefile, sql, csv and toml results. */
	private static readonly negativeConfidenceCorrection = 0.5;

	private readonly _workerTextModelSyncServer = new WorkerTextModelSyncServer();

	private readonly _host: LanguageDetectionWorkerHost;

	/** Cached regexp model; stays `undefined` until the first successful load. */
	private _regexpModel: RegexpModel | undefined;
	/** Set once importing the regexp model has thrown. */
	private _regexpLoadFailed = false;

	/** Cached neural model operations; stays `undefined` until the first successful load. */
	private _modelOperations: ModelOperations | undefined;
	/** Set once obtaining the neural model operations has thrown. */
	private _loadFailed = false;

	/** Cache of the host's answer for each language id reported by the neural model. */
	private readonly modelIdToCoreId = new Map<string, string | undefined>();

	constructor(workerServer: IWebWorkerServer) {
		this._host = LanguageDetectionWorkerHost.getChannel(workerServer);
		this._workerTextModelSyncServer.bindToServer(workerServer);
	}

	/**
	 * Detects the language of the model identified by `uri`.
	 *
	 * When the neural detector accepts at least one language, the accepted
	 * languages, their confidences and the elapsed time are sent to the host as
	 * a telemetry event.
	 *
	 * @param uri Identifier used to look the model up in the text model sync server.
	 * @param langBiases Per-language biases for the regexp model; an empty table is used when `undefined`.
	 * @param preferHistory When `true` the regexp model is asked first and the neural model is the
	 * fallback; otherwise the neural model is asked first.
	 * @param supportedLangs When non-empty, neural results are restricted to these language ids. The
	 * list is also forwarded to the regexp model.
	 * @returns The detected language id, or `undefined` when the model is unknown, yields no text,
	 * or neither detector produces a result.
	 */
	public async $detectLanguage(uri: string, langBiases: Record<string, number> | undefined, preferHistory: boolean, supportedLangs?: string[]): Promise<string | undefined> {
		const stopWatch = new StopWatch();
		const documentTextSample = this.getTextForDetection(uri);
		if (!documentTextSample) {
			return undefined;
		}

		const neuralResolver = async (): Promise<string | undefined> => {
			const languages: string[] = [];
			const confidences: number[] = [];

			for await (const { languageId, confidence } of this.detectLanguagesImpl(documentTextSample)) {
				// Ask the host only once per model language id, even when the answer is `undefined`.
				if (!this.modelIdToCoreId.has(languageId)) {
					this.modelIdToCoreId.set(languageId, await this._host.$getLanguageId(languageId));
				}
				const coreId = this.modelIdToCoreId.get(languageId);
				if (coreId && (!supportedLangs?.length || supportedLangs.includes(coreId))) {
					languages.push(coreId);
					confidences.push(confidence);
				}
			}
			stopWatch.stop();

			if (!languages.length) {
				return undefined;
			}
			this._host.$sendTelemetryEvent(languages, confidences, stopWatch.elapsed());
			return languages[0];
		};

		const historicalResolver = async () => this.runRegexpModel(documentTextSample, langBiases ?? {}, supportedLangs);

		// The second resolver only runs when the first one produced nothing.
		const resolvers = preferHistory
			? [historicalResolver, neuralResolver]
			: [neuralResolver, historicalResolver];
		for (const resolve of resolvers) {
			const detected = await resolve();
			if (detected) {
				return detected;
			}
		}

		return undefined;
	}

	/**
	 * Returns the leading part of the model's text: everything from line 1,
	 * column 1 up to the position the model reports for offset 10000.
	 *
	 * @returns The text sample, or `undefined` when no model is registered for `uri`.
	 */
	private getTextForDetection(uri: string): string | undefined {
		const editorModel = this._workerTextModelSyncServer.getModel(uri);
		if (!editorModel) {
			return undefined;
		}

		const end = editorModel.positionAt(10000);
		return editorModel.getValueInRange({
			startLineNumber: 1,
			startColumn: 1,
			endLineNumber: end.lineNumber,
			endColumn: end.column
		});
	}

	/**
	 * Returns the regexp model, importing it from the URI supplied by the host
	 * on first use.
	 *
	 * @returns The cached or freshly imported model, or `undefined` when the
	 * import has failed now or on an earlier call.
	 */
	private async getRegexpModel(): Promise<RegexpModel | undefined> {
		if (this._regexpLoadFailed) {
			return undefined;
		}
		if (this._regexpModel) {
			return this._regexpModel;
		}

		const uri: string = await this._host.$getRegexpModelUri();
		try {
			this._regexpModel = await importAMDNodeModule(uri, '') as RegexpModel;
			return this._regexpModel;
		} catch {
			// Remember the failure so later calls do not retry the import.
			this._regexpLoadFailed = true;
			return undefined;
		}
	}

	/**
	 * Runs the regexp model over `content`.
	 *
	 * @param content Text to classify.
	 * @param langBiases Per-language biases. This object is never modified.
	 * @param supportedLangs When non-empty, the biases passed to the model are replaced by `1` for
	 * every listed language and `0` for every other language present in `langBiases`.
	 * @returns Whatever the model's `detect` returns, or `undefined` when the model is unavailable.
	 */
	private async runRegexpModel(content: string, langBiases: Record<string, number>, supportedLangs?: string[]): Promise<string | undefined> {
		const regexpModel = await this.getRegexpModel();
		if (!regexpModel) {
			return undefined;
		}

		let biases = langBiases;
		if (supportedLangs?.length) {
			// When using supportedLangs, normally computed biases are too extreme. Just use a "bitmask" of sorts.
			// The mask is built as a new object so that the caller's `langBiases` is left untouched.
			const allowed = new Set(supportedLangs);
			biases = Object.fromEntries(
				Object.keys(langBiases).map((lang): [string, number] => [lang, allowed.has(lang) ? 1 : 0])
			);
		}

		return regexpModel.detect(content, biases, supportedLangs);
	}

	/**
	 * Returns the neural model operations, creating them on first use from the
	 * module at the URI supplied by the host.
	 *
	 * The model JSON and the weights are fetched by the loader callbacks given
	 * to `ModelOperations`, from URIs that are also supplied by the host.
	 *
	 * @throws Whatever the host call or the module import throws.
	 */
	private async getModelOperations(): Promise<ModelOperations> {
		if (this._modelOperations) {
			return this._modelOperations;
		}

		const uri: string = await this._host.$getIndexJsUri();
		const { ModelOperations } = await importAMDNodeModule(uri, '') as typeof import('@vscode/vscode-languagedetection');

		this._modelOperations = new ModelOperations({
			modelJsonLoaderFunc: async () => {
				const response = await fetch(await this._host.$getModelJsonUri());
				try {
					// `await` is required here so that a parse failure is caught below.
					return await response.json();
				} catch {
					throw new Error('Failed to parse model JSON.');
				}
			},
			weightsLoaderFunc: async () => {
				const response = await fetch(await this._host.$getWeightsUri());
				return response.arrayBuffer();
			}
		});

		return this._modelOperations;
	}

	/**
	 * Adjusts the language confidence scores to be more accurate based on:
	 * * VS Code's language usage
	 * * Languages with 'problematic' syntaxes that have caused incorrect language detection
	 *
	 * @param modelResult Result to adjust. Its `confidence` is changed in place.
	 * @returns The same `modelResult` object.
	 */
	private adjustLanguageConfidence(modelResult: ModelResult): ModelResult {
		switch (modelResult.languageId) {
			// For the following languages, we increase the confidence because
			// these are commonly used languages in VS Code and supported
			// by the model.
			case 'js':
			case 'html':
			case 'json':
			case 'ts':
			case 'css':
			case 'py':
			case 'xml':
			case 'php':
				modelResult.confidence += LanguageDetectionWorker.positiveConfidenceCorrectionBucket1;
				break;

			// case 'yaml': // YAML has been known to cause incorrect language detection because the language is pretty simple. We don't want to increase the confidence for this.
			case 'cpp':
			case 'sh':
			case 'java':
			case 'cs':
			case 'c':
				modelResult.confidence += LanguageDetectionWorker.positiveConfidenceCorrectionBucket2;
				break;

			// For the following languages, we need to be extra confident that the language is correct because
			// we've had issues like #131912 that caused incorrect guesses. To enforce this, we subtract the
			// negativeConfidenceCorrection from the confidence.

			// languages that are provided by default in VS Code
			case 'bat':
			case 'ini':
			case 'makefile':
			case 'sql':
			// languages that aren't provided by default in VS Code
			case 'csv':
			case 'toml':
				// Other considerations for negativeConfidenceCorrection that
				// aren't built in but supported by the model include:
				// * Assembly, TeX - These languages didn't have clear language modes in the community
				// * Markdown, Dockerfile - These languages are simple but they embed other languages
				modelResult.confidence -= LanguageDetectionWorker.negativeConfidenceCorrection;
				break;

			default:
				break;
		}
		return modelResult;
	}

	/**
	 * Runs the neural model over `content` and yields the results it is
	 * sufficiently confident about, in the order the model returned them and
	 * with their confidence adjusted by `adjustLanguageConfidence`.
	 *
	 * Nothing is yielded when the model cannot be loaded, when running it
	 * throws, when it returns no results, or when the first result is below
	 * `expectedRelativeConfidence` before or after adjustment.
	 *
	 * Otherwise results are buffered, starting with the first one. The buffer is
	 * yielded as soon as a result turns up whose adjusted confidence is at least
	 * `expectedRelativeConfidence` lower than that of the last buffered result.
	 * Iteration ends at the first result whose adjusted confidence is not above
	 * `expectedRelativeConfidence`. Results still buffered at that point, or
	 * when the model results run out, are not yielded.
	 */
	private async * detectLanguagesImpl(content: string): AsyncGenerator<ModelResult, void, unknown> {
		if (this._loadFailed) {
			return;
		}

		let modelOperations: ModelOperations | undefined;
		try {
			modelOperations = await this.getModelOperations();
		} catch (e) {
			console.log(e);
			this._loadFailed = true;
			return;
		}

		let modelResults: ModelResult[] | undefined;
		try {
			modelResults = await modelOperations.runModel(content);
		} catch (e) {
			// A failed run is not remembered; the next call tries again.
			console.warn(e);
		}

		const threshold = LanguageDetectionWorker.expectedRelativeConfidence;

		if (!modelResults
			|| modelResults.length === 0
			|| modelResults[0].confidence < threshold) {
			return;
		}

		const firstModelResult = this.adjustLanguageConfidence(modelResults[0]);
		if (firstModelResult.confidence < threshold) {
			return;
		}

		// Results accepted since the last time the buffer was yielded.
		const possibleLanguages: ModelResult[] = [firstModelResult];

		for (const rawResult of modelResults) {
			// The first result has already been adjusted and buffered above.
			if (rawResult === firstModelResult) {
				continue;
			}

			const current = this.adjustLanguageConfidence(rawResult);
			const lastBuffered = possibleLanguages[possibleLanguages.length - 1];

			// A large enough drop after the buffered results means they can be reported.
			if (lastBuffered.confidence - current.confidence >= threshold) {
				yield* possibleLanguages.splice(0);
			}

			// Written as a negated `>` so that it matches the original comparison exactly.
			if (!(current.confidence > threshold)) {
				return;
			}
			possibleLanguages.push(current);
		}
	}
}
284
285