Path: blob/main/src/vs/workbench/services/languageDetection/browser/languageDetectionWebWorker.ts
3296 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import type { ModelOperations, ModelResult } from '@vscode/vscode-languagedetection';
import { importAMDNodeModule } from '../../../../amdX.js';
import { StopWatch } from '../../../../base/common/stopwatch.js';
import { IWebWorkerServerRequestHandler, IWebWorkerServer } from '../../../../base/common/worker/webWorker.js';
import { LanguageDetectionWorkerHost, ILanguageDetectionWorker } from './languageDetectionWorker.protocol.js';
import { WorkerTextModelSyncServer } from '../../../../editor/common/services/textModelSync/textModelSync.impl.js';

/** Shape of the dynamically imported regexp-based ("historical") detection module. */
type RegexpModel = { detect: (inp: string, langBiases: Record<string, number>, supportedLangs?: string[]) => string | undefined };

/**
 * Worker entry point: builds the request handler that serves language
 * detection requests arriving on `workerServer`.
 */
export function create(workerServer: IWebWorkerServer): IWebWorkerServerRequestHandler {
	return new LanguageDetectionWorker(workerServer);
}

/**
 * Detects the language of a synced text model by combining a neural model
 * (`@vscode/vscode-languagedetection`) with a regexp-based model. Both models
 * are loaded lazily from URIs supplied by the host.
 *
 * @internal
 */
export class LanguageDetectionWorker implements ILanguageDetectionWorker {
	_requestHandlerBrand: any;

	/** Minimum confidence, and minimum gap to the next result, for a neural guess to be reported. */
	private static readonly expectedRelativeConfidence = 0.2;
	private static readonly positiveConfidenceCorrectionBucket1 = 0.05;
	private static readonly positiveConfidenceCorrectionBucket2 = 0.025;
	private static readonly negativeConfidenceCorrection = 0.5;

	private readonly _workerTextModelSyncServer = new WorkerTextModelSyncServer();

	private readonly _host: LanguageDetectionWorkerHost;
	private _regexpModel: RegexpModel | undefined;
	private _regexpLoadFailed: boolean = false;

	private _modelOperations: ModelOperations | undefined;
	private _loadFailed: boolean = false;

	/**
	 * Cache of neural-model language id -> id returned by the host's `$getLanguageId`.
	 * `undefined` answers are cached too, so each model id is looked up at most once.
	 */
	private readonly modelIdToCoreId = new Map<string, string | undefined>();

	constructor(workerServer: IWebWorkerServer) {
		this._host = LanguageDetectionWorkerHost.getChannel(workerServer);
		this._workerTextModelSyncServer.bindToServer(workerServer);
	}

	/**
	 * Detects the language of the synced model identified by `uri`.
	 *
	 * @param uri Identifier of the model in the text model sync server.
	 * @param langBiases Per-language biases handed to the regexp model; treated as `{}` when omitted.
	 * @param preferHistory When `true` the regexp model is consulted before the neural model, otherwise after it.
	 * @param supportedLangs When non-empty, only these language ids may be returned.
	 * @returns The first language either model produces, or `undefined` when the
	 * model is unknown, has no text, or neither model produces a result.
	 */
	public async $detectLanguage(uri: string, langBiases: Record<string, number> | undefined, preferHistory: boolean, supportedLangs?: string[]): Promise<string | undefined> {
		const languages: string[] = [];
		const confidences: number[] = [];
		const stopWatch = new StopWatch();
		const documentTextSample = this.getTextForDetection(uri);
		if (!documentTextSample) { return; }

		const neuralResolver = async () => {
			for await (const language of this.detectLanguagesImpl(documentTextSample)) {
				if (!this.modelIdToCoreId.has(language.languageId)) {
					this.modelIdToCoreId.set(language.languageId, await this._host.$getLanguageId(language.languageId));
				}
				const coreId = this.modelIdToCoreId.get(language.languageId);
				if (coreId && (!supportedLangs?.length || supportedLangs.includes(coreId))) {
					languages.push(coreId);
					confidences.push(language.confidence);
				}
			}
			stopWatch.stop();

			if (languages.length) {
				this._host.$sendTelemetryEvent(languages, confidences, stopWatch.elapsed());
				return languages[0];
			}
			return undefined;
		};

		const historicalResolver = async () => this.runRegexpModel(documentTextSample, langBiases ?? {}, supportedLangs);

		if (preferHistory) {
			const history = await historicalResolver();
			if (history) { return history; }
			const neural = await neuralResolver();
			if (neural) { return neural; }
		} else {
			const neural = await neuralResolver();
			if (neural) { return neural; }
			const history = await historicalResolver();
			if (history) { return history; }
		}

		return undefined;
	}

	/**
	 * Returns the text used for detection: the range from the start of the
	 * model to the position at offset 10000. Returns `undefined` when no
	 * model is synced for `uri`.
	 */
	private getTextForDetection(uri: string): string | undefined {
		const editorModel = this._workerTextModelSyncServer.getModel(uri);
		if (!editorModel) { return; }

		const end = editorModel.positionAt(10000);
		const content = editorModel.getValueInRange({
			startColumn: 1,
			startLineNumber: 1,
			endColumn: end.column,
			endLineNumber: end.lineNumber
		});
		return content;
	}

	/**
	 * Lazily imports the regexp model from the URI supplied by the host.
	 * An import failure is remembered, so later calls return `undefined`
	 * without retrying.
	 */
	private async getRegexpModel(): Promise<RegexpModel | undefined> {
		if (this._regexpLoadFailed) {
			return;
		}
		if (this._regexpModel) {
			return this._regexpModel;
		}
		const uri: string = await this._host.$getRegexpModelUri();
		try {
			this._regexpModel = await importAMDNodeModule(uri, '') as RegexpModel;
			return this._regexpModel;
		} catch (e) {
			this._regexpLoadFailed = true;
			// console.warn('error loading language detection model', e);
			return;
		}
	}

	/**
	 * Runs the regexp model over `content`.
	 *
	 * @param content Text to classify.
	 * @param langBiases Per-language biases. This object is never modified.
	 * @param supportedLangs When non-empty, the biases are replaced by a 1/0 mask
	 * (1 for supported languages, 0 otherwise) before being passed to the model.
	 * @returns The detected language, or `undefined` when the model is unavailable
	 * or detects nothing.
	 */
	private async runRegexpModel(content: string, langBiases: Record<string, number>, supportedLangs?: string[]): Promise<string | undefined> {
		const regexpModel = await this.getRegexpModel();
		if (!regexpModel) { return; }

		let effectiveBiases = langBiases;
		if (supportedLangs?.length) {
			// When using supportedLangs, normally computed biases are too extreme. Just use a "bitmask" of sorts.
			// The mask is built in a fresh object so the caller's biases are left untouched.
			effectiveBiases = {};
			for (const lang of Object.keys(langBiases)) {
				effectiveBiases[lang] = supportedLangs.includes(lang) ? 1 : 0;
			}
		}

		return regexpModel.detect(content, effectiveBiases, supportedLangs);
	}

	/**
	 * Lazily creates the neural `ModelOperations` instance. The library is
	 * imported from the URI supplied by the host; the model JSON and weights
	 * are fetched from host-supplied URIs by the loader callbacks passed to
	 * `ModelOperations`.
	 */
	private async getModelOperations(): Promise<ModelOperations> {
		if (this._modelOperations) {
			return this._modelOperations;
		}

		const uri: string = await this._host.$getIndexJsUri();
		const { ModelOperations } = await importAMDNodeModule(uri, '') as typeof import('@vscode/vscode-languagedetection');
		this._modelOperations = new ModelOperations({
			modelJsonLoaderFunc: async () => {
				const response = await fetch(await this._host.$getModelJsonUri());
				try {
					const modelJSON = await response.json();
					return modelJSON;
				} catch (e) {
					const message = `Failed to parse model JSON.`;
					throw new Error(message);
				}
			},
			weightsLoaderFunc: async () => {
				const response = await fetch(await this._host.$getWeightsUri());
				const buffer = await response.arrayBuffer();
				return buffer;
			}
		});

		return this._modelOperations;
	}

	// This adjusts the language confidence scores to be more accurate based on:
	// * VS Code's language usage
	// * Languages with 'problematic' syntaxes that have caused incorrect language detection
	//
	// Note: the passed-in result is adjusted in place and the same object is returned.
	private adjustLanguageConfidence(modelResult: ModelResult): ModelResult {
		switch (modelResult.languageId) {
			// For the following languages, we increase the confidence because
			// these are commonly used languages in VS Code and supported
			// by the model.
			case 'js':
			case 'html':
			case 'json':
			case 'ts':
			case 'css':
			case 'py':
			case 'xml':
			case 'php':
				modelResult.confidence += LanguageDetectionWorker.positiveConfidenceCorrectionBucket1;
				break;
			// case 'yaml': // YAML has been known to cause incorrect language detection because the language is pretty simple. We don't want to increase the confidence for this.
			case 'cpp':
			case 'sh':
			case 'java':
			case 'cs':
			case 'c':
				modelResult.confidence += LanguageDetectionWorker.positiveConfidenceCorrectionBucket2;
				break;

			// For the following languages, we need to be extra confident that the language is correct because
			// we've had issues like #131912 that caused incorrect guesses. To enforce this, we subtract the
			// negativeConfidenceCorrection from the confidence.

			// languages that are provided by default in VS Code
			case 'bat':
			case 'ini':
			case 'makefile':
			case 'sql':
			// languages that aren't provided by default in VS Code
			case 'csv':
			case 'toml':
				// Other considerations for negativeConfidenceCorrection that
				// aren't built in but supported by the model include:
				// * Assembly, TeX - These languages didn't have clear language modes in the community
				// * Markdown, Dockerfile - These languages are simple but they embed other languages
				modelResult.confidence -= LanguageDetectionWorker.negativeConfidenceCorrection;
				break;

			default:
				break;

		}
		return modelResult;
	}

	/**
	 * Runs the neural model over `content` and yields the results that are
	 * confident enough to report, in the order the model returned them.
	 *
	 * Results are buffered as candidates and only emitted once a following
	 * result trails the most recent candidate by at least
	 * `expectedRelativeConfidence`, or once the result list is exhausted.
	 * If a result at or below `expectedRelativeConfidence` is reached while the
	 * gap is still smaller than that, the buffered candidates are dropped.
	 *
	 * Yields nothing when the model failed to load (remembered for later calls),
	 * the run fails, or the top result is below `expectedRelativeConfidence`
	 * before or after adjustment.
	 */
	private async * detectLanguagesImpl(content: string): AsyncGenerator<ModelResult, void, unknown> {
		if (this._loadFailed) {
			return;
		}

		let modelOperations: ModelOperations | undefined;
		try {
			modelOperations = await this.getModelOperations();
		} catch (e) {
			console.warn(e);
			this._loadFailed = true;
			return;
		}

		let modelResults: ModelResult[] | undefined;

		try {
			modelResults = await modelOperations.runModel(content);
		} catch (e) {
			console.warn(e);
		}

		if (!modelResults
			|| modelResults.length === 0
			|| modelResults[0].confidence < LanguageDetectionWorker.expectedRelativeConfidence) {
			return;
		}

		const firstModelResult = this.adjustLanguageConfidence(modelResults[0]);
		if (firstModelResult.confidence < LanguageDetectionWorker.expectedRelativeConfidence) {
			return;
		}

		const possibleLanguages: ModelResult[] = [firstModelResult];

		for (let current of modelResults) {
			// The first result has already been adjusted and buffered above.
			if (current === firstModelResult) {
				continue;
			}

			current = this.adjustLanguageConfidence(current);
			const currentHighest = possibleLanguages[possibleLanguages.length - 1];

			// The buffered candidates clearly beat this result: emit them and empty the buffer.
			if (currentHighest.confidence - current.confidence >= LanguageDetectionWorker.expectedRelativeConfidence) {
				yield* possibleLanguages.splice(0);
			}

			if (current.confidence > LanguageDetectionWorker.expectedRelativeConfidence) {
				possibleLanguages.push(current);
				continue;
			}

			// Not confident enough to be a candidate. Anything still buffered was too
			// close to this result to be trusted, so it is dropped.
			return;
		}

		// The result list ended with candidates still buffered (e.g. the model returned a
		// single result). Nothing trails them, so they are emitted rather than lost.
		yield* possibleLanguages;
	}
}