Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/embeddings/common/embeddingsIndex.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import type { Memento, Uri } from 'vscode';
7
import { VSBuffer } from '../../../util/vs/base/common/buffer';
8
import { URI } from '../../../util/vs/base/common/uri';
9
import { IInstantiationService } from '../../../util/vs/platform/instantiation/common/instantiation';
10
import { IVSCodeExtensionContext } from '../../extContext/common/extensionContext';
11
import { fileSystemServiceReadAsJSON, IFileSystemService } from '../../filesystem/common/fileSystemService';
12
import { ILogService } from '../../log/common/logService';
13
import { IFetcherService } from '../../networking/common/fetcherService';
14
import { IWorkbenchService } from '../../workbench/common/workbenchService';
15
import { Embedding, EmbeddingType, EmbeddingVector, getWellKnownEmbeddingTypeInfo, IEmbeddingsComputer, LEGACY_EMBEDDING_MODEL_ID, rankEmbeddings } from './embeddingsComputer';
16
17
/**
 * Minimal query surface of an embeddings index.
 *
 * @typeParam K Type of the value used to look an item up.
 * @typeParam V Type of the values returned by similarity queries.
 */
interface EmbeddingsIndex<K, V> {
	/** Reports whether the index holds an item for the given lookup value. */
	hasItem(value: K): boolean;
	/** Flag exposing whether the index has been loaded. */
	isIndexLoaded: boolean;
	/** Returns the values closest to `embedding`; `n` is the requested count. */
	nClosestValues(embedding: Embedding, n: number): V[];
}
22
23
/** Flat cache layout: maps an item key to the embedding vector stored for that item. */
type EmbeddingCacheEntries = Record<string, { embedding: EmbeddingVector }>;
24
/**
 * Nested cache layout that keeps core entries apart from per-extension entries.
 */
interface EmbeddingCacheEntriesWithExtensions {
	/** Entries that do not belong to any extension. */
	core: EmbeddingCacheEntries;
	/** Entries grouped by extension id. */
	extensions: { [key: string]: EmbeddingCacheEntries };
}
28
29
/**
 * Kinds of remotely published embedding caches.
 * The string value is used verbatim as a path segment when building CDN URLs.
 */
export enum RemoteCacheType {
	Settings = 'settings',
	Commands = 'commands',
	Api = 'api',
	Extensions = 'extensions',
	ProjectTemplates = 'project-templates',
	Tools = 'tools'
}
37
38
/**
 * Blob storage container names where computed embeddings are published.
 * The string value is used verbatim as a path segment when building CDN URLs.
 */
enum RemoteEmbeddingsContainer {
	TEXT3SMALL = 'text-3-small',
	METIS_1024_I16_BINARY = 'metis-1024-I16-Binary'
}
43
44
/**
 * Maps an embedding type to the remote container holding its published embeddings.
 *
 * @param embeddingType The embedding type to resolve.
 * @returns `METIS_1024_I16_BINARY` when the type's well-known model is the Metis I16
 * binary model; `TEXT3SMALL` for the text-3-small model and for every other
 * (including unknown) model.
 */
function embeddingsModelToRemoteContainer(embeddingType: EmbeddingType): RemoteEmbeddingsContainer {
	switch (getWellKnownEmbeddingTypeInfo(embeddingType)?.model) {
		case LEGACY_EMBEDDING_MODEL_ID.Metis_I16_Binary:
			return RemoteEmbeddingsContainer.METIS_1024_I16_BINARY;

		// text-3-small doubles as the fallback container
		case LEGACY_EMBEDDING_MODEL_ID.TEXT3SMALL:
		default:
			return RemoteEmbeddingsContainer.TEXT3SMALL;
	}
}
54
55
/**
 * Selects which extension storage a cache uses: the global one or the
 * workspace-specific one.
 */
export enum EmbeddingCacheType {
	GLOBAL = 1,
	WORKSPACE = 2,
}
59
60
61
/**
 * File-backed embeddings cache.
 *
 * Entries are stored as `<cacheKey>.json` under the extension's global or workspace
 * storage location (chosen by `cacheType`). The version the file was written with is
 * tracked in the matching memento under `<cacheKey>-version`; a cache whose recorded
 * version differs from `cacheVersion` is treated as absent.
 */
class EmbeddingsCache {
	private readonly cacheVersionKey: string;

	constructor(
		private readonly cacheType: EmbeddingCacheType,
		private readonly cacheKey: string,
		protected readonly cacheVersion: string,
		@IFileSystemService private readonly fileSystemService: IFileSystemService,
		@IVSCodeExtensionContext private readonly extensionContext: IVSCodeExtensionContext
	) {
		this.cacheVersionKey = `${cacheKey}-version`;
	}

	/** Storage directory for the cache file: workspace storage for WORKSPACE caches, global storage otherwise. */
	public get cacheStorageUri(): Uri | undefined {
		return this.cacheType === EmbeddingCacheType.WORKSPACE
			? this.extensionContext.storageUri
			: this.extensionContext.globalStorageUri;
	}

	/** Memento holding the cache version: workspace state for WORKSPACE caches, global state otherwise. */
	public get cacheVersionMementoStorage(): Memento {
		return this.cacheType === EmbeddingCacheType.WORKSPACE
			? this.extensionContext.workspaceState
			: this.extensionContext.globalState;
	}

	/**
	 * Serializes `value` as JSON into the cache file and records the cache version.
	 *
	 * Does nothing when there is no storage location, when `value` is `undefined`, or
	 * when the storage file system is not writable. A failed write is logged and leaves
	 * the recorded version untouched.
	 *
	 * @param value The cache content to persist.
	 */
	public async updateCache<T = EmbeddingCacheEntries>(value: T | undefined) {
		const storageUri = this.cacheStorageUri;
		if (!storageUri || value === undefined) {
			return;
		}
		// Cannot write to readonly file system
		if (!this.fileSystemService.isWritableFileSystem(storageUri.scheme)) {
			return;
		}
		// Create directory at storageUri if it doesn't exist
		try {
			await this.fileSystemService.stat(storageUri);
		} catch (e) {
			if (EmbeddingsCache.hasErrorCode(e, 'ENOENT')) {
				// Directory doesn't exist we should create it
				await this.fileSystemService.createDirectory(storageUri);
			}
			// Any other stat failure is ignored here; the write below will surface real problems.
		}
		const cacheFile = URI.joinPath(storageUri, `${this.cacheKey}.json`);
		try {
			await this.fileSystemService.writeFile(cacheFile, VSBuffer.fromString(JSON.stringify(value)).buffer);
		} catch (e) {
			console.error(`Failed to write embeddings cache to ${cacheFile}`, e);
			// Do not mark the cache as current when its content could not be written.
			return;
		}
		// Record the version only once the file content matches it
		await this.cacheVersionMementoStorage.update(this.cacheVersionKey, this.cacheVersion);
	}

	/**
	 * Reads the cache file.
	 *
	 * @returns The cached content, or `undefined` when there is no storage location, the
	 * recorded version differs from `cacheVersion`, or the file cannot be read. Content
	 * stored in the nested core/extensions layout is returned flattened.
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		if (!this.cacheStorageUri) {
			return;
		}
		const cacheVersion = this.cacheVersionMementoStorage.get<string>(this.cacheVersionKey);

		if (cacheVersion !== this.cacheVersion) {
			return undefined;
		}
		try {
			const cacheEntries: any = await fileSystemServiceReadAsJSON.readJSON<T>(this.fileSystemService, URI.joinPath(this.cacheStorageUri, `${this.cacheKey}.json`));
			if (this.isEmbeddingCacheEntriesType(cacheEntries)) {
				// If the cache is of the type EmbeddingCacheEntriesWithExtensions (during tests), we need to flatten it
				return this.constructExposedCache(cacheEntries as EmbeddingCacheEntriesWithExtensions) as T;
			}

			return cacheEntries as T;

		} catch {
			return undefined;
		}
	}

	/**
	 * Deletes the cache file and removes any memento value stored directly under `cacheKey`.
	 *
	 * @throws Error when the storage directory or the cache file does not exist (`ENOENT`).
	 * Other file system failures are ignored.
	 */
	public async clearCache() {
		if (!this.cacheStorageUri) {
			return;
		}

		// NOTE(review): presumably a leftover from when the cache lived in the memento itself - confirm
		const hasOldCache = this.cacheVersionMementoStorage.get(this.cacheKey);
		if (hasOldCache) {
			await this.cacheVersionMementoStorage.update(this.cacheKey, undefined);
		}

		const cacheFile = URI.joinPath(this.cacheStorageUri, `${this.cacheKey}.json`);
		try {
			await this.fileSystemService.stat(this.cacheStorageUri);
			await this.fileSystemService.delete(cacheFile, { useTrash: false });
		} catch (e) {
			if (EmbeddingsCache.hasErrorCode(e, 'ENOENT')) {
				throw new Error(`Cache file ${cacheFile} does not exist`);
			}
		}
	}

	/** Tells whether a thrown value carries the given `code` property; safe for `null`/`undefined`. */
	private static hasErrorCode(error: unknown, code: string): boolean {
		return (error as { code?: unknown } | null | undefined)?.code === code;
	}

	/** True when `cache` uses the nested layout, i.e. defines both `core` and `extensions`. */
	private isEmbeddingCacheEntriesType(cache: EmbeddingCacheEntries | EmbeddingCacheEntriesWithExtensions) {
		return cache.core !== undefined && cache.extensions !== undefined;
	}

	/**
	 * Flattens the nested layout into a single key -> entry map.
	 * Extension entries overwrite core entries that share the same key.
	 */
	private constructExposedCache(cache: EmbeddingCacheEntriesWithExtensions): EmbeddingCacheEntries | undefined {
		const flattenedCache: EmbeddingCacheEntries = { ...cache.core };
		for (const extensionId in cache.extensions) {
			const extensionCache = cache.extensions[extensionId];
			for (const key in extensionCache) {
				flattenedCache[key] = extensionCache[key];
			}
		}
		return flattenedCache;
	}

}
175
176
/**
 * Common contract of the embeddings caches in this file.
 */
export interface IEmbeddingsCache {
	/** The embedding type of the entries held by this cache. */
	readonly embeddingType: EmbeddingType;

	/** Resolves the cached content, or `undefined` when none is available. */
	getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined>;
	/** Removes the cached content. */
	clearCache(): Promise<void>;
}
182
183
/**
 * A local cache which caches information on disk.
 */
export class LocalEmbeddingsCache implements IEmbeddingsCache {

	private readonly _embeddingsCache: EmbeddingsCache;

	constructor(
		cacheType: EmbeddingCacheType,
		private readonly cacheKey: string,
		private readonly cacheVersion: string,
		public readonly embeddingType: EmbeddingType,
		@IInstantiationService instantiationService: IInstantiationService
	) {
		this._embeddingsCache = instantiationService.createInstance(
			EmbeddingsCache,
			cacheType,
			cacheKey,
			cacheVersion
		);
	}

	/**
	 * Reads the on-disk cache.
	 *
	 * @returns The cached content.
	 * @throws Error when the underlying cache yields `undefined`.
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const cacheEntries = await this._embeddingsCache.getCache<T>();
		if (cacheEntries === undefined) {
			throw new Error(`Failed to get cache for ${this.cacheKey}, version ${this.cacheVersion}`);
		}
		return cacheEntries;
	}

	/** Clears the on-disk cache by delegating to the underlying cache. */
	clearCache(): Promise<void> {
		return this._embeddingsCache.clearCache();
	}
}
216
217
/**
 * An embeddings cache which fetches embeddings from a remote CDN.
 * It is limited to one remote file
 *
 * The downloaded content is mirrored into a local `EmbeddingsCache`. The content of the
 * remote `latest.txt` is stored in the memento under `<cacheKey>-version-remote` and is
 * used to decide whether the local mirror is still current.
 */
export class RemoteEmbeddingsCache implements IEmbeddingsCache {
	// In-memory copy of the downloaded remote cache, reused by later fetchRemoteCache calls
	private _remoteCacheEntries: EmbeddingCacheEntries | undefined;
	private readonly remoteCacheVersionKey: string;

	// Lazily computed CDN URLs
	private _remoteCacheURL: string | undefined;
	private _remoteCacheLatestUpdateURL: string | undefined;
	protected embeddingsCache: EmbeddingsCache;

	constructor(
		cacheType: EmbeddingCacheType,
		cacheKey: string,
		protected readonly cacheVersion: string,
		public readonly embeddingType: EmbeddingType,
		protected readonly remoteCacheType: RemoteCacheType,
		@IFetcherService protected readonly fetcherService: IFetcherService,
		@IInstantiationService instantiationService: IInstantiationService
	) {
		this.embeddingsCache = instantiationService.createInstance(
			EmbeddingsCache,
			cacheType,
			cacheKey,
			cacheVersion
		);
		this.remoteCacheVersionKey = `${cacheKey}-version-remote`;
	}

	/** Clears the local mirror of the remote cache. */
	async clearCache(): Promise<void> {
		await this.embeddingsCache.clearCache();
	}

	/** Resolves the remote container matching this cache's embedding type. */
	protected async getRemoteContainer(): Promise<RemoteEmbeddingsContainer> {
		return embeddingsModelToRemoteContainer(this.embeddingType);
	}

	/** Returns the URL of the remote `core.json`, computing and memoizing it on first use. */
	private async getRemoteCacheURL(): Promise<string> {
		let url = this._remoteCacheURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsCache.calculateRemoteCDNURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._remoteCacheURL = url;
		}
		return url;
	}

	/** Returns the URL of the remote `latest.txt`, computing and memoizing it on first use. */
	private async getRemoteCacheLatestUpdateURL(): Promise<string> {
		let url = this._remoteCacheLatestUpdateURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsCache.calculateRemoteCDNLatestURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._remoteCacheLatestUpdateURL = url;
		}
		return url;
	}

	/**
	 * Downloads the remote cache, reusing the in-memory copy when one exists.
	 *
	 * @returns The remote entries, or `undefined` when the request fails or the response
	 * is not OK (both cases are logged).
	 */
	protected async fetchRemoteCache(): Promise<EmbeddingCacheEntries | undefined> {
		if (this._remoteCacheEntries) {
			return this._remoteCacheEntries;
		}
		const remoteCacheURL = await this.getRemoteCacheURL();
		try {
			const response = await this.fetcherService.fetch(remoteCacheURL, { method: 'GET', callSite: 'embeddings-remote-cache' });
			if (response.ok) {
				this._remoteCacheEntries = (await response.json()) as EmbeddingCacheEntries;
				return this._remoteCacheEntries;
			} else {
				console.error(`Failed to fetch remote embeddings cache from ${remoteCacheURL}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${remoteCacheURL}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Downloads the content of the remote `latest.txt`.
	 *
	 * @returns The response text, or `undefined` when the request fails or the response
	 * is not OK (both cases are logged).
	 */
	protected async fetchRemoteCacheLatest(): Promise<string | undefined> {
		const remoteCacheLatestUpdateURL = await this.getRemoteCacheLatestUpdateURL();
		try {
			const response = await this.fetcherService.fetch(remoteCacheLatestUpdateURL, { method: 'GET', callSite: 'embeddings-remote-cache-latest' });
			if (response.ok) {
				return response.text();
			} else {
				console.error(`Failed to fetch remote embeddings cache from ${remoteCacheLatestUpdateURL}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${remoteCacheLatestUpdateURL}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Returns the local mirror when it is current, otherwise refreshes it from the CDN.
	 *
	 * @returns The cache content. When the remote cache cannot be fetched, the previous
	 * local content is returned instead, which may be `undefined`.
	 */
	public async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const remoteCacheLatest = await this.fetchRemoteCacheLatest();
		const cache = await this.embeddingsCache.getCache();
		// If the cache exists and the remote cache version is a match,
		// it means it is the latest version and we can return it,
		// otherwise we will fetch again the remote cache
		if (cache && remoteCacheLatest === this.embeddingsCache.cacheVersionMementoStorage.get<string>(this.remoteCacheVersionKey)) {
			return cache as T;
		}
		const remoteCache = await this.fetchRemoteCache();
		if (remoteCache === undefined) {
			// fallback to previous local cache if remote cache is unavailable
			return cache as T;
		}

		// Replace the local mirror with the freshly downloaded content
		await this.embeddingsCache.clearCache();
		await this.embeddingsCache.cacheVersionMementoStorage.update(this.remoteCacheVersionKey, remoteCacheLatest);
		await this.embeddingsCache.updateCache(remoteCache);
		return remoteCache as T;
	}

	/** Builds the CDN URL of the `core.json` file for the given container, cache type and version. */
	static calculateRemoteCDNURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}/core.json`;
	}

	/** Builds the CDN URL of the `latest.txt` file for the given container, cache type and version. */
	static calculateRemoteCDNLatestURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}/latest.txt`;
	}
}
341
342
/**
 * A remote cache which is also aware of installed extensions and updates properly when they are updated, installed, or uninstalled
 * Internally we use a nested structure which breaks down core, and each extension id for better perf.
 * Externally a flattened cache with all values on the same level is exposed for easier consumption and to conform to the other cache interfaces.
 * When updating the cache we use the internal structure rather than the flattened one because the flattened one is only for external consumption.
 */
export class RemoteEmbeddingsExtensionCache extends RemoteEmbeddingsCache {
	// This is a nested structure used to help us do just patching of updated extensions
	private _remoteExtensionCache: EmbeddingCacheEntriesWithExtensions | undefined;
	// Lazily computed base URL under which per-extension files are looked up
	private _baseExtensionCDNURL: string | undefined;

	constructor(
		cacheType: EmbeddingCacheType,
		cacheKey: string,
		cacheVersion: string,
		embeddingType: EmbeddingType,
		remoteCacheType: RemoteCacheType,
		@IFetcherService fetcher: IFetcherService,
		@IWorkbenchService private readonly workbenchService: IWorkbenchService,
		@IInstantiationService instantiationService: IInstantiationService,
	) {
		super(cacheType, cacheKey, cacheVersion, embeddingType, remoteCacheType, fetcher, instantiationService);
	}

	/** Returns the base CDN URL for per-extension files, computing and memoizing it on first use. */
	private async getBaseExtensionCDNURL(): Promise<string> {
		let url = this._baseExtensionCDNURL;
		if (!url) {
			const remoteCacheContainer = await this.getRemoteContainer();
			url = RemoteEmbeddingsExtensionCache.calculateBaseRemoteExtensionCDNURL(remoteCacheContainer, this.remoteCacheType, this.cacheVersion);
			this._baseExtensionCDNURL = url;
		}
		return url;
	}

	/**
	 * Flattens the nested cache into a single key -> entry map.
	 * Extension entries overwrite core entries that share the same key.
	 *
	 * @returns The flattened cache, or `undefined` when no nested cache has been built yet.
	 */
	private constructExposedCache(): EmbeddingCacheEntries | undefined {
		if (!this._remoteExtensionCache) {
			return;
		}
		const flattenedCache: EmbeddingCacheEntries = { ...this._remoteExtensionCache.core };
		for (const extensionId in this._remoteExtensionCache.extensions) {
			const extensionCache = this._remoteExtensionCache.extensions[extensionId];
			for (const key in extensionCache) {
				flattenedCache[key] = extensionCache[key];
			}
		}
		return flattenedCache;
	}

	/**
	 * Downloads the embeddings file of a single extension.
	 *
	 * @param extensionId The extension id, used as the file name on the CDN.
	 * @returns The extension's entries; an empty object when the CDN answers 404; or
	 * `undefined` when the request fails or returns another non-OK status (logged).
	 */
	private async fetchRemoteExtensionCache(extensionId: string): Promise<EmbeddingCacheEntries | undefined> {
		const baseExtensionCDNURL = await this.getBaseExtensionCDNURL();
		const extensionUrl = `${baseExtensionCDNURL}/${extensionId}.json`;
		try {
			const response = await this.fetcherService.fetch(extensionUrl, { method: 'GET', callSite: 'embeddings-extension-cache' });
			if (response.ok) {
				return (await response.json()) as EmbeddingCacheEntries;
			} else {
				if (response.status === 404) {
					// The file doesn't exist on our CDN return an empty object so we don't try to fetch it again
					return {};
				}
				console.error(`Failed to fetch remote embeddings cache from ${extensionUrl}`);
				console.error(`Response status: ${response.status}, status text: ${response.statusText}`);
				return;
			}
		} catch (err) {
			console.error(`Failed to fetch remote embeddings cache from ${extensionUrl}`);
			console.error(err);
			return;
		}
	}

	/**
	 * Builds the cache from the core cache plus the entries of every installed extension.
	 *
	 * Extensions that are no longer installed are dropped, missing ones are fetched one
	 * after another, and the nested structure is persisted whenever something was fetched
	 * or removed.
	 *
	 * @returns The flattened cache, or `undefined` when the core cache is unavailable.
	 */
	public override async getCache<T = EmbeddingCacheEntries>(): Promise<T | undefined> {
		const coreOrLocalCache = await super.getCache<EmbeddingCacheEntries | EmbeddingCacheEntriesWithExtensions>();
		// The remote cache for core coming back unavailable indicates request problems so we cannot continue with fetching extensions
		if (coreOrLocalCache === undefined) {
			return;
		}
		// Check if the cache has a property 'core' as the RemoteCacheWithExtensions has it
		const currentCache: EmbeddingCacheEntriesWithExtensions =
			RemoteEmbeddingsExtensionCache.isEmbeddingsCacheEntriesWithExtensions(coreOrLocalCache)
				? coreOrLocalCache
				: { core: coreOrLocalCache, extensions: {} };

		const activatedExtensionIds = RemoteEmbeddingsExtensionCache.getInstalledExtensionIds(this.workbenchService);
		const installedIds = new Set(activatedExtensionIds);
		let removedExtensions = false;
		// Remove any extensions from currentCache which aren't in activatedExtensionIds
		for (const extensionId in currentCache.extensions) {
			if (!installedIds.has(extensionId)) {
				delete currentCache.extensions[extensionId];
				removedExtensions = true;
			}
		}
		const extensionIdsToFetch = activatedExtensionIds.filter(
			id => !(id in currentCache.extensions) || currentCache.extensions[id] === undefined
		);

		for (const extensionId of extensionIdsToFetch) {
			const extensionCache = await this.fetchRemoteExtensionCache(extensionId);
			// A failed fetch leaves the extension out so it is retried on the next call
			if (extensionCache) {
				currentCache.extensions[extensionId] = extensionCache;
			}
		}

		this._remoteExtensionCache = currentCache;
		if (extensionIdsToFetch.length > 0 || removedExtensions) {
			await this.embeddingsCache.clearCache();
			await this.embeddingsCache.updateCache(currentCache);
		}

		return this.constructExposedCache() as T;
	}

	/**
	 * Type guard for the nested core/extensions layout.
	 * Returns `false` for `null`, `undefined` and primitives instead of throwing.
	 */
	static isEmbeddingsCacheEntriesWithExtensions(obj: any): obj is EmbeddingCacheEntriesWithExtensions {
		return typeof obj === 'object' && obj !== null && 'core' in obj && 'extensions' in obj;
	}

	/** Lists the ids of all extensions reported by the workbench service, skipping ids that start with `vscode`. */
	static getInstalledExtensionIds(workbenchService: IWorkbenchService): string[] {
		return workbenchService.getAllExtensions().filter(e => !e.id.startsWith('vscode')).map(e => e.id);
	}

	/** Builds the CDN base URL under which `<extensionId>.json` files are looked up. */
	static calculateBaseRemoteExtensionCDNURL(cacheContainer: RemoteEmbeddingsContainer, embeddingsType: RemoteCacheType, cacheVersion: string): string {
		return `https://embeddings.vscode-cdn.net/${cacheContainer}/v${cacheVersion}/${embeddingsType}`;
	}
}
469
470
/**
 * Base class for an in-memory index of items keyed by string, whose embeddings are
 * taken from an `IEmbeddingsCache`.
 *
 * Subclasses supply the current set of items via `getLatestItems`.
 *
 * @typeParam V Item type; `key` identifies the item, `embedding` is filled in when known.
 */
export abstract class BaseEmbeddingsIndex<V extends { key: string; embedding?: EmbeddingVector }>
	implements EmbeddingsIndex<string, V> {
	protected _items: Map<string, V>;
	private _isIndexLoaded = false;
	// The calculation currently in flight, if any; shared by concurrent callers
	private _calculationPromise: Promise<void> | undefined;

	constructor(
		loggerContext: string,
		private readonly embeddingType: EmbeddingType,
		private readonly cacheKey: string,
		private readonly _embeddingsCache: IEmbeddingsCache,
		protected readonly embeddingsComputer: IEmbeddingsComputer,
		protected readonly logService: ILogService,
	) {
		this._items = new Map<string, V>();
	}

	/** Whether a calculation has completed at least once. */
	public get isIndexLoaded(): boolean {
		return this._isIndexLoaded;
	}

	protected set isIndexLoaded(value: boolean) {
		this._isIndexLoaded = value;
	}

	/** Clears the backing cache and the in-memory items, then recalculates the index. */
	public async rebuildCache() {
		await this._embeddingsCache.clearCache();
		this._items.clear();
		return this.calculateEmbeddings();
	}

	/**
	 * Finds the n closest values to a given embedding
	 * @param queryEmbedding The embedding to find the n closest values for
	 * @param n The number of closest values to return
	 * @returns The n closest values to the embedding, sorted by similarity. Could be less than n if there are less than n items indexed
	 */
	public nClosestValues(queryEmbedding: Embedding, n: number): V[] {
		// Only items that already have an embedding can be ranked
		return rankEmbeddings(queryEmbedding, Array.from(this._items.values()).filter(x => x.embedding).map(x => [x, { value: x.embedding!, type: this.embeddingType } satisfies Embedding] as const), n)
			.map(x => x.value);
	}

	/** Reports whether an item with the given key is indexed. */
	public hasItem(key: string): boolean {
		return this._items.has(key);
	}

	/** Returns the indexed item with the given key, if any. */
	public getItem(key: string): V | undefined {
		return this._items.get(key);
	}

	/**
	 * Recalculates the index. Calls made while a calculation is in flight share that
	 * calculation instead of starting another one.
	 *
	 * The in-flight marker is cleared when the calculation settles, whether it succeeded
	 * or failed, so a failed calculation can be retried by calling this method again.
	 */
	public async calculateEmbeddings(): Promise<void> {
		// This prevents being able to queue many calculations at once since it should always be referring to the same promise
		if (this._calculationPromise) {
			return this._calculationPromise;
		}
		const calculation = this._calculateEmbeddings();
		this._calculationPromise = calculation;
		try {
			await calculation;
		} finally {
			if (this._calculationPromise === calculation) {
				this._calculationPromise = undefined;
			}
		}
	}

	/**
	 * Rebuilds `_items` from the latest items. For each item, an already indexed entry
	 * with an embedding is kept as is; otherwise the cached entry for the key (if any)
	 * is merged into the item; otherwise the item is stored without changes.
	 */
	private async _calculateEmbeddings(): Promise<void> {
		const startTime = Date.now();
		const allItems: V[] = await this.getLatestItems();
		const cachedEmbeddings = await this._embeddingsCache.getCache();
		const latestEmbeddingsIndex = new Map<string, V>();
		for (const item of allItems) {
			let newItem = item;
			const oldItem = this._items.get(item.key);
			const key = item.key;
			// We have it in our current index
			if (oldItem?.embedding) {
				newItem = oldItem;
			} else if (cachedEmbeddings && cachedEmbeddings[key]) {
				// We have it in our cache
				newItem = { ...item, ...cachedEmbeddings[key] };
			}

			latestEmbeddingsIndex.set(key, newItem);
		}

		this._items = latestEmbeddingsIndex;

		this.logService.debug(`Embeddings for ${this.cacheKey} calculated in ${Date.now() - startTime}ms`);
		this.isIndexLoaded = true;
	}

	/**
	 * Converts the value into the string that will be used to calculate the embedding
	 * @param value The value to convert to a natural language query
	 * @returns The natural language query
	 */
	protected abstract getEmbeddingQueryString(value: V): string;

	/** Supplies the current set of items the index should contain. */
	protected abstract getLatestItems(): Promise<V[]>;
}
565