Path: blob/main/extensions/copilot/src/platform/embeddings/common/embeddingsIndex.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import type { Memento, Uri } from 'vscode';
import { VSBuffer } from '../../../util/vs/base/common/buffer';
import { URI } from '../../../util/vs/base/common/uri';
import { IInstantiationService } from '../../../util/vs/platform/instantiation/common/instantiation';
import { IVSCodeExtensionContext } from '../../extContext/common/extensionContext';
import { fileSystemServiceReadAsJSON, IFileSystemService } from '../../filesystem/common/fileSystemService';
import { ILogService } from '../../log/common/logService';
import { IFetcherService } from '../../networking/common/fetcherService';
import { IWorkbenchService } from '../../workbench/common/workbenchService';
import { Embedding, EmbeddingType, EmbeddingVector, getWellKnownEmbeddingTypeInfo, IEmbeddingsComputer, LEGACY_EMBEDDING_MODEL_ID, rankEmbeddings } from './embeddingsComputer';

/**
 * Minimal contract of an in-memory index that can be queried by embedding similarity.
 */
interface EmbeddingsIndex<K, V> {
	hasItem(value: K): boolean;
	isIndexLoaded: boolean;
	nClosestValues(embedding: Embedding, n: number): V[];
}

/** Flat cache shape: item key -> its embedding vector. */
type EmbeddingCacheEntries = { [key: string]: { embedding: EmbeddingVector } };

/** Nested cache shape: core entries plus one flat cache per extension id. */
interface EmbeddingCacheEntriesWithExtensions {
	core: EmbeddingCacheEntries;
	extensions: { [key: string]: EmbeddingCacheEntries };
}

export enum RemoteCacheType {
	Settings = 'settings',
	Commands = 'commands',
	Api = 'api',
	Extensions = 'extensions',
	ProjectTemplates = 'project-templates',
	Tools = 'tools'
}

// These values are the blob storage container names where we publish computed embeddings
enum RemoteEmbeddingsContainer {
	TEXT3SMALL = 'text-3-small',
	METIS_1024_I16_BINARY = 'metis-1024-I16-Binary'
}

/**
 * Maps an embedding type to the remote container holding its published embeddings.
 * Any model other than Metis I16 binary (including unknown ones) maps to the text-3-small container.
 */
function embeddingsModelToRemoteContainer(embeddingType: EmbeddingType): RemoteEmbeddingsContainer {
	switch (getWellKnownEmbeddingTypeInfo(embeddingType)?.model) {
		case LEGACY_EMBEDDING_MODEL_ID.Metis_I16_Binary:
			return RemoteEmbeddingsContainer.METIS_1024_I16_BINARY;

		case LEGACY_EMBEDDING_MODEL_ID.TEXT3SMALL:
		default:
			return RemoteEmbeddingsContainer.TEXT3SMALL;
	}
}

export enum EmbeddingCacheType {
	GLOBAL = 1,
	WORKSPACE = 2,
}

/**
 * Flattens a nested cache into a single-level cache.
 * Core entries are copied first; entries of each extension are then merged on top,
 * so an extension entry overrides a core entry (or an earlier extension entry) with the same key.
 */
function flattenCacheWithExtensions(cache: EmbeddingCacheEntriesWithExtensions): EmbeddingCacheEntries {
	const flattenedCache: EmbeddingCacheEntries = { ...cache.core };
	for (const extensionId in cache.extensions) {
		const extensionCache = cache.extensions[extensionId];
		for (const key in extensionCache) {
			flattenedCache[key] = extensionCache[key];
		}
	}
	return flattenedCache;
}

/**
 * Persists embeddings as `<cacheKey>.json` inside the extension's (global or workspace) storage
 * directory and tracks the version of the persisted payload in the matching memento
 * under `<cacheKey>-version`.
 */
class EmbeddingsCache {
	private readonly cacheVersionKey: string;

	constructor(
		private readonly cacheType: EmbeddingCacheType,
		private readonly cacheKey: string,
		protected readonly cacheVersion: string,
		@IFileSystemService private readonly fileSystemService: IFileSystemService,
		@IVSCodeExtensionContext private readonly extensionContext: IVSCodeExtensionContext
	) {
		this.cacheVersionKey = `${cacheKey}-version`;
	}

	/** Storage directory for this cache; workspace storage may be undefined. */
	public get cacheStorageUri(): Uri | undefined {
		return this.cacheType === EmbeddingCacheType.WORKSPACE
			? this.extensionContext.storageUri
			: this.extensionContext.globalStorageUri;
	}

	/** Memento used to remember which cache version is currently on disk. */
	public get cacheVersionMementoStorage(): Memento {
		return this.cacheType === EmbeddingCacheType.WORKSPACE
			? this.extensionContext.workspaceState
			: this.extensionContext.globalState;
	}

	/**
	 * Writes `value` to the cache file and records the current cache version.
	 * Does nothing when there is no storage location, the value is undefined,
	 * or the storage file system is read-only. Write failures are logged, not thrown.
	 * @param value The cache content to serialize as JSON
	 */
	public async updateCache<T = EmbeddingCacheEntries>(value: T | undefined) {
		const storageUri = this.cacheStorageUri;
		if (!storageUri || value === undefined) {
			return;
		}
		// Cannot write to readonly file system
		if (!this.fileSystemService.isWritableFileSystem(storageUri.scheme)) {
			return;
		}
		// Create directory at storageUri if it doesn't exist
		try {
			await this.fileSystemService.stat(storageUri);
		} catch (e) {
			if (e.code === 'ENOENT') {
				// Directory doesn't exist we should create it
				await this.fileSystemService.createDirectory(storageUri);
			}
		}
		// Update cache version
		await this.cacheVersionMementoStorage.update(this.cacheVersionKey, this.cacheVersion);
		const cacheFile = URI.joinPath(storageUri, `${this.cacheKey}.json`);
		try {
			await this.fileSystemService.writeFile(cacheFile, VSBuffer.fromString(JSON.stringify(value)).buffer);
		} catch (err) {
			// Best effort: a failed write only costs us the cache, so log (with the cause) and continue
			console.error(`Failed to write embeddings cache to ${cacheFile}`, err);
		}
	}

	/**
	 * Reads the cache file if the recorded version matches the expected cache version.
	 * @returns The cached content, or undefined when there is no storage location,
	 * the version does not match, or the file cannot be read/parsed.
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const storageUri = this.cacheStorageUri;
		if (!storageUri) {
			return;
		}
		const cacheVersion = this.cacheVersionMementoStorage.get<string>(this.cacheVersionKey);

		if (cacheVersion !== this.cacheVersion) {
			return undefined;
		}
		try {
			const cacheEntries: any = await fileSystemServiceReadAsJSON.readJSON<T>(this.fileSystemService, URI.joinPath(storageUri, `${this.cacheKey}.json`));
			if (this.isEmbeddingCacheEntriesType(cacheEntries)) {
				// If the cache is of the type EmbeddingCacheEntriesWithExtensions (during tests), we need to flatten it
				return this.constructExposedCache(cacheEntries as EmbeddingCacheEntriesWithExtensions) as T;
			}

			return cacheEntries as T;

		} catch {
			return undefined;
		}
	}

	/**
	 * Removes the value stored in the memento under the bare cache key (if any) and deletes the cache file.
	 * @throws Error when stat/delete fails with code `ENOENT`; other file system errors are ignored.
	 */
	public async clearCache() {
		const storageUri = this.cacheStorageUri;
		if (!storageUri) {
			return;
		}

		// NOTE(review): this reads `cacheKey`, not `cacheVersionKey` — presumably an older storage format; confirm
		const hasOldCache = this.cacheVersionMementoStorage.get(this.cacheKey);
		if (hasOldCache) {
			await this.cacheVersionMementoStorage.update(this.cacheKey, undefined);
		}

		const cacheFile = URI.joinPath(storageUri, `${this.cacheKey}.json`);
		try {
			await this.fileSystemService.stat(storageUri);
			await this.fileSystemService.delete(cacheFile, { useTrash: false });
		} catch (e) {
			if (e.code === 'ENOENT') {
				throw new Error(`Cache file ${cacheFile} does not exist`);
			}
		}
	}

	/** Returns true when `cache` has both `core` and `extensions`, i.e. it is in the nested format. */
	private isEmbeddingCacheEntriesType(cache: EmbeddingCacheEntries | EmbeddingCacheEntriesWithExtensions) {
		return cache.core !== undefined && cache.extensions !== undefined;
	}

	/** Flattens a nested cache into the single-level shape exposed to consumers. */
	private constructExposedCache(cache: EmbeddingCacheEntriesWithExtensions): EmbeddingCacheEntries | undefined {
		return flattenCacheWithExtensions(cache);
	}

}

export interface IEmbeddingsCache {
	readonly embeddingType: EmbeddingType;

	getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined>;
	clearCache(): Promise<void>;
}

/**
 * A local cache which caches information on disk.
 */
export class LocalEmbeddingsCache implements IEmbeddingsCache {

	private readonly _embeddingsCache: EmbeddingsCache;
	constructor(
		cacheType: EmbeddingCacheType,
		private readonly cacheKey: string,
		private readonly cacheVersion: string,
		public readonly embeddingType: EmbeddingType,
		@IInstantiationService instantiationService: IInstantiationService
	) {
		this._embeddingsCache = instantiationService.createInstance(
			EmbeddingsCache,
			cacheType,
			cacheKey,
			cacheVersion
		);
	}

	/**
	 * Reads the on-disk cache.
	 * @throws Error when the underlying cache returns undefined (missing, unreadable or version mismatch)
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const cacheEntries = await this._embeddingsCache.getCache<T>();
		if (cacheEntries === undefined) {
			throw new Error(`Failed to get cache for ${this.cacheKey}, version ${this.cacheVersion}`);
		}
		return cacheEntries;
	}

	clearCache(): Promise<void> {
		return this._embeddingsCache.clearCache();
	}
}

/**
 * An embeddings cache which fetches embeddings from a remote CDN.
 * It is limited to one remote file
 */
export class RemoteEmbeddingsCache implements IEmbeddingsCache {
	private _remoteCacheEntries: EmbeddingCacheEntries | undefined;
	private readonly remoteCacheVersionKey: string;

	private _remoteCacheURL: string | undefined;
	private _remoteCacheLatestUpdateURL: string | undefined;
	protected embeddingsCache: EmbeddingsCache;

	constructor(
		cacheType: EmbeddingCacheType,
		cacheKey: string,
		protected readonly cacheVersion: string,
		public readonly embeddingType: EmbeddingType,
		protected readonly remoteCacheType: RemoteCacheType,
		@IFetcherService protected readonly fetcherService: IFetcherService,
		@IInstantiationService instantiationService: IInstantiationService
	) {
		this.embeddingsCache = instantiationService.createInstance(
			EmbeddingsCache,
			cacheType,
			cacheKey,
			cacheVersion
		);
		this.remoteCacheVersionKey = `${cacheKey}-version-remote`;
	}

	async clearCache(): Promise<void> {
		await this.embeddingsCache.clearCache();
	}

	/** Resolves the remote container for this cache's embedding type. */
	protected async getRemoteContainer(): Promise<RemoteEmbeddingsContainer> {
		return embeddingsModelToRemoteContainer(this.embeddingType);
	}

	/** Lazily computes and memoizes the URL of the remote `core.json`. */
	private async getRemoteCacheURL(): Promise<string> {
		let url = this._remoteCacheURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsCache.calculateRemoteCDNURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._remoteCacheURL = url;
		}
		return url;
	}

	/** Lazily computes and memoizes the URL of the remote `latest.txt`. */
	private async getRemoteCacheLatestUpdateURL(): Promise<string> {
		let url = this._remoteCacheLatestUpdateURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsCache.calculateRemoteCDNLatestURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._remoteCacheLatestUpdateURL = url;
		}
		return url;
	}

	/**
	 * Downloads the remote cache, memoizing a successful response for the lifetime of this instance.
	 * @returns The remote entries, or undefined when the request fails or returns a non-OK status (both are logged)
	 */
	protected async fetchRemoteCache(): Promise<EmbeddingCacheEntries | undefined> {
		if (this._remoteCacheEntries) {
			return this._remoteCacheEntries;
		}
		const remoteCacheURL = await this.getRemoteCacheURL();
		try {
			const response = await this.fetcherService.fetch(remoteCacheURL, { method: 'GET', callSite: 'embeddings-remote-cache' });
			if (response.ok) {
				this._remoteCacheEntries = (await response.json()) as EmbeddingCacheEntries;
				return this._remoteCacheEntries;
			} else {
				console.error(`Failed to fetch remote embeddings cache from ${remoteCacheURL}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${remoteCacheURL}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Downloads the content of the remote `latest.txt`, which is compared against the locally stored
	 * marker to decide whether the local cache is up to date.
	 * @returns The response text, or undefined when the request fails or returns a non-OK status (both are logged)
	 */
	protected async fetchRemoteCacheLatest(): Promise<string | undefined> {
		const remoteCacheLatestUpdateURL = await this.getRemoteCacheLatestUpdateURL();
		try {
			const response = await this.fetcherService.fetch(remoteCacheLatestUpdateURL, { method: 'GET', callSite: 'embeddings-remote-cache-latest' });
			if (response.ok) {
				return response.text();
			} else {
				console.error(`Failed to fetch remote embeddings cache from ${remoteCacheLatestUpdateURL}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${remoteCacheLatestUpdateURL}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Returns the local cache when it matches the remote "latest" marker, otherwise downloads
	 * the remote cache, persists it locally and returns it.
	 * Falls back to the (possibly undefined) local cache when the remote cache cannot be fetched.
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const remoteCacheLatest = await this.fetchRemoteCacheLatest();
		const cache = await this.embeddingsCache.getCache();
		// If the cache exists and the remote cache version is a match,
		// it means it is the latest version and we can return it,
		// otherwise we will fetch again the remote cache
		if (cache && remoteCacheLatest === this.embeddingsCache.cacheVersionMementoStorage.get<string>(this.remoteCacheVersionKey)) {
			return cache as T;
		}
		const remoteCache = await this.fetchRemoteCache();
		if (remoteCache === undefined) {
			// fallback to previous local cache if remote cache is unavailable
			return cache as T;
		}

		await this.embeddingsCache.clearCache();
		await this.embeddingsCache.cacheVersionMementoStorage.update(this.remoteCacheVersionKey, remoteCacheLatest);
		await this.embeddingsCache.updateCache(remoteCache);
		return remoteCache as T;
	}

	static calculateRemoteCDNURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}/core.json`;
	}

	static calculateRemoteCDNLatestURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}/latest.txt`;
	}
}

/**
 * A remote cache which is also aware of installed extensions and updates properly when they are updated, installed, or uninstalled
 * Internally we use a nested structure which breaks down core, and each extension id for better perf.
 * Externally a flattened cache with all values on the same level is exposed for easier consumption and to conform to the other cache interfaces.
 * When updating the cache we use the internal structure rather than the flattened one because the flattened one is only for external consumption.
 */
export class RemoteEmbeddingsExtensionCache extends RemoteEmbeddingsCache {
	// This is a nested structure used to help us do just patching of updated extensions
	private _remoteExtensionCache: EmbeddingCacheEntriesWithExtensions | undefined;
	private _baseExtensionCDNURL: string | undefined;

	constructor(
		cacheType: EmbeddingCacheType,
		cacheKey: string,
		cacheVersion: string,
		embeddingType: EmbeddingType,
		remoteCacheType: RemoteCacheType,
		@IFetcherService fetcher: IFetcherService,
		@IWorkbenchService private readonly workbenchService: IWorkbenchService,
		@IInstantiationService instantiationService: IInstantiationService,
	) {
		super(cacheType, cacheKey, cacheVersion, embeddingType, remoteCacheType, fetcher, instantiationService);
	}

	/** Lazily computes and memoizes the CDN folder URL that holds the per-extension JSON files. */
	private async getBaseExtensionCDNURL(): Promise<string> {
		let url = this._baseExtensionCDNURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsExtensionCache.calculateBaseRemoteExtensionCDNURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._baseExtensionCDNURL = url;
		}
		return url;
	}

	/** Flattens the in-memory nested cache; undefined when nothing has been loaded yet. */
	private constructExposedCache(): EmbeddingCacheEntries | undefined {
		if (!this._remoteExtensionCache) {
			return;
		}
		return flattenCacheWithExtensions(this._remoteExtensionCache);
	}

	/**
	 * Downloads the embeddings published for a single extension.
	 * @param extensionId The extension id, used as the file name on the CDN
	 * @returns The entries, an empty object on 404, or undefined on any other failure (which is logged)
	 */
	private async fetchRemoteExtensionCache(extensionId: string): Promise<EmbeddingCacheEntries | undefined> {
		const baseExtensionCDNURL = await this.getBaseExtensionCDNURL();
		const extensionUrl = `${baseExtensionCDNURL}/${extensionId}.json`;
		try {
			const response = await this.fetcherService.fetch(extensionUrl, { method: 'GET', callSite: 'embeddings-extension-cache' });
			if (response.ok) {
				return (await response.json()) as EmbeddingCacheEntries;
			} else {
				if (response.status === 404) {
					// The file doesn't exist on our CDN return an empty object so we don't try to fetch it again
					return {};
				}
				console.error(`Failed to fetch remote embeddings cache from ${extensionUrl}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${extensionUrl}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Loads the core cache through the base class, drops entries of extensions that are no longer
	 * installed, fetches entries for installed extensions that are missing, persists the nested
	 * structure when anything changed, and returns the flattened view.
	 * @returns The flattened cache, or undefined when the core cache is unavailable
	 */
	public override async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const coreOrLocalCache = await super.getCache<EmbeddingCacheEntries | EmbeddingCacheEntriesWithExtensions>();
		// The remote cache for core coming back unavailable indicates request problems so we cannot continue with fetching extensions
		if (coreOrLocalCache === undefined) {
			return;
		}
		// NOTE(review): EmbeddingsCache.getCache already flattens nested caches read from disk, so the
		// nested branch below is only taken when the base class hands back an un-flattened object — confirm intent.
		let currentCache: EmbeddingCacheEntriesWithExtensions;
		// Check if the cache has a property 'core' as the RemoteCachewithExtensions has it
		if (RemoteEmbeddingsExtensionCache.isEmbeddingsCacheEntriesWithExtensions(coreOrLocalCache)) {
			currentCache = coreOrLocalCache;
		} else {
			currentCache = { core: coreOrLocalCache, extensions: {} };
		}

		const activatedExtensionIds = RemoteEmbeddingsExtensionCache.getInstalledExtensionIds(this.workbenchService);
		const activatedExtensionIdSet = new Set(activatedExtensionIds);
		let removedExtensions = false;
		// Remove any extensions from currentCache which aren't in activatedExtensionIds
		for (const extensionId of Object.keys(currentCache.extensions)) {
			if (!activatedExtensionIdSet.has(extensionId)) {
				delete currentCache.extensions[extensionId];
				removedExtensions = true;
			}
		}
		const extensionIdsToFetch = activatedExtensionIds.filter(
			id => !(id in currentCache.extensions) || currentCache.extensions[id] === undefined
		);

		for (const extensionId of extensionIdsToFetch) {
			const extensionCache = await this.fetchRemoteExtensionCache(extensionId);
			if (extensionCache) {
				currentCache.extensions[extensionId] = extensionCache;
			}
		}

		this._remoteExtensionCache = currentCache;
		if (extensionIdsToFetch.length > 0 || removedExtensions) {
			await this.embeddingsCache.clearCache();
			await this.embeddingsCache.updateCache(currentCache);
		}

		return this.constructExposedCache() as T;
	}

	/**
	 * Type guard for the nested cache format: true when `obj` is a non-null object that has
	 * both a `core` and an `extensions` property. Non-object values yield false instead of throwing.
	 */
	static isEmbeddingsCacheEntriesWithExtensions(obj: unknown): obj is EmbeddingCacheEntriesWithExtensions {
		return typeof obj === 'object' && obj !== null && 'core' in obj && 'extensions' in obj;
	}

	/** Ids of all extensions known to the workbench, excluding ids that start with `vscode`. */
	static getInstalledExtensionIds(workbenchService: IWorkbenchService): string[] {
		return workbenchService.getAllExtensions().filter(e => !e.id.startsWith('vscode')).map(e => e.id);
	}

	static calculateBaseRemoteExtensionCDNURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}`;
	}
}

/**
 * Base class for an in-memory index of items keyed by string, each optionally carrying an embedding
 * that is looked up from an {@link IEmbeddingsCache}.
 */
export abstract class BaseEmbeddingsIndex<V extends { key: string; embedding?: EmbeddingVector }>
	implements EmbeddingsIndex<string, V> {
	protected _items: Map<string, V>;
	private _isIndexLoaded = false;
	private _calculationPromise: Promise<void> | undefined;

	constructor(
		loggerContext: string,
		private readonly embeddingType: EmbeddingType,
		private readonly cacheKey: string,
		private readonly _embeddingsCache: IEmbeddingsCache,
		protected readonly embeddingsComputer: IEmbeddingsComputer,
		protected readonly logService: ILogService,
	) {
		this._items = new Map<string, V>();
	}

	public get isIndexLoaded(): boolean {
		return this._isIndexLoaded;
	}

	protected set isIndexLoaded(value: boolean) {
		this._isIndexLoaded = value;
	}

	/** Clears the backing cache and the in-memory items, then recalculates the index. */
	public async rebuildCache() {
		await this._embeddingsCache.clearCache();
		this._items.clear();
		return this.calculateEmbeddings();
	}

	/**
	 * Finds the n closest values to a given embedding
	 * @param queryEmbedding The embedding to find the n closest values for
	 * @param n The number of closest values to return
	 * @returns The n closest values to the embedding, sorted by similarity. Could be less than n if there are less than n items indexed
	 */
	public nClosestValues(queryEmbedding: Embedding, n: number): V[] {
		// Items without an embedding cannot be ranked and are skipped
		const rankable = Array.from(this._items.values())
			.filter(x => x.embedding)
			.map(x => [x, { value: x.embedding!, type: this.embeddingType } satisfies Embedding] as const);
		return rankEmbeddings(queryEmbedding, rankable, n)
			.map(x => x.value);
	}

	public hasItem(key: string): boolean {
		return this._items.has(key);
	}

	public getItem(key: string): V | undefined {
		return this._items.get(key);
	}

	/**
	 * Recalculates the index. Calls made while a calculation is in flight share that calculation.
	 * The in-flight marker is cleared whether the calculation succeeds or fails, so a failure
	 * does not prevent later calls from retrying.
	 */
	public async calculateEmbeddings(): Promise<void> {
		// This prevents being able to queue many calculations at once since it should always be referring to the same promise
		if (this._calculationPromise) {
			return this._calculationPromise;
		}
		this._calculationPromise = this._calculateEmbeddings();
		try {
			await this._calculationPromise;
		} finally {
			// Must also run on rejection, otherwise the rejected promise would be handed out forever
			this._calculationPromise = undefined;
		}
	}

	/**
	 * Rebuilds `_items` from the latest items, reusing embeddings already held in memory and
	 * otherwise taking them from the embeddings cache when present.
	 */
	private async _calculateEmbeddings(): Promise<void> {
		const startTime = Date.now();
		const allItems: V[] = await this.getLatestItems();
		const cachedEmbeddings = await this._embeddingsCache.getCache();
		const latestEmbeddingsIndex = new Map<string, V>();
		for (const item of allItems) {
			let newItem = item;
			const oldItem = this._items.get(item.key);
			const key = item.key;
			// We have it in our current index
			if (oldItem?.embedding) {
				newItem = oldItem;
			} else if (cachedEmbeddings && cachedEmbeddings[key]) {
				// We have it in our cache
				newItem = { ...item, ...cachedEmbeddings[key] };
			}

			latestEmbeddingsIndex.set(key, newItem);
		}

		this._items = latestEmbeddingsIndex;

		this.logService.debug(`Embeddings for ${this.cacheKey} calculated in ${Date.now() - startTime}ms`);
		this.isIndexLoaded = true;
	}

	/**
	 * Converts the value into the string that will be used to calculate the embedding
	 * @param value The value to convert to a natural language query
	 * @returns The natural language query
	 */
	protected abstract getEmbeddingQueryString(value: V): string;

	protected abstract getLatestItems(): Promise<V[]>;
}