Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/embeddings/test/node/embeddingsGrouper.spec.ts
13405 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { beforeEach, describe, expect, it } from 'vitest';
7
import { Embedding, EmbeddingType } from '../../common/embeddingsComputer';
8
import { EmbeddingsGrouper, GroupingOptions, Node } from '../../common/embeddingsGrouper';
9
10
interface TestTool {
11
name: string;
12
category: string;
13
}
14
15
// Helper function to create test embeddings
16
function createEmbedding(values: number[]): Embedding {
17
return {
18
type: EmbeddingType.text3small_512,
19
value: values
20
};
21
}
22
23
// Helper function to create test nodes
24
function createNode(name: string, category: string, embedding: number[]): Node<TestTool> {
25
return {
26
value: { name, category },
27
embedding: createEmbedding(embedding)
28
};
29
}
30
31
// Create embeddings that should cluster together (high cosine similarity)
32
function createSimilarEmbeddings(): number[][] {
33
return [
34
[1, 0.8, 0.2, 0.1], // Similar to group 1
35
[0.9, 0.7, 0.1, 0.15], // Similar to group 1
36
[0.1, 0.2, 1, 0.8], // Similar to group 2
37
[0.15, 0.1, 0.9, 0.7], // Similar to group 2
38
[0.5, 0.5, 0.5, 0.5] // Outlier
39
];
40
}
41
42
describe('EmbeddingsGrouper', () => {
43
let grouper: EmbeddingsGrouper<TestTool>;
44
45
beforeEach(() => {
46
grouper = new EmbeddingsGrouper<TestTool>();
47
});
48
49
describe('constructor and initial state', () => {
50
it('should initialize with empty clusters', () => {
51
expect(grouper.getClusters()).toHaveLength(0);
52
});
53
54
it('should accept custom options', () => {
55
const options: GroupingOptions = {
56
eps: 0.85, // High similarity threshold
57
minClusterSize: 3,
58
insertThreshold: 0.7
59
};
60
const customGrouper = new EmbeddingsGrouper<TestTool>(options);
61
expect(customGrouper.getClusters()).toHaveLength(0);
62
});
63
});
64
65
describe('addNode', () => {
66
it('should create singleton cluster for first node', () => {
67
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
68
grouper.addNode(node);
69
70
const clusters = grouper.getClusters();
71
expect(clusters).toHaveLength(1);
72
expect(clusters[0].nodes).toHaveLength(1);
73
expect(clusters[0].nodes[0]).toBe(node);
74
});
75
76
it('should create separate clusters for dissimilar nodes', () => {
77
const node1 = createNode('tool1', 'category1', [1, 0, 0, 0]);
78
const node2 = createNode('tool2', 'category2', [0, 1, 0, 0]);
79
80
grouper.addNode(node1);
81
grouper.addNode(node2);
82
83
const clusters = grouper.getClusters();
84
expect(clusters).toHaveLength(2);
85
expect(clusters.every(c => c.nodes.length === 1)).toBe(true);
86
});
87
88
it('should add similar nodes to existing clusters when possible', () => {
89
const embeddings = createSimilarEmbeddings();
90
const nodes = [
91
createNode('tool1', 'category1', embeddings[0]),
92
createNode('tool2', 'category1', embeddings[1])
93
];
94
95
grouper.addNode(nodes[0]);
96
grouper.addNode(nodes[1]);
97
98
// After adding similar nodes incrementally, they might not cluster
99
// Let's force clustering to see the behavior
100
grouper.recluster();
101
102
const clusters = grouper.getClusters();
103
expect(clusters.length).toBeGreaterThan(0);
104
105
// Similar embeddings should be in same cluster
106
const cluster = grouper.getClusterForNode(nodes[0]);
107
expect(cluster).toBeDefined();
108
expect(cluster!.nodes.some(n => n === nodes[1])).toBe(true);
109
});
110
});
111
112
describe('addNodes (bulk)', () => {
113
it('should handle empty array efficiently', () => {
114
grouper.addNodes([]);
115
expect(grouper.getClusters()).toHaveLength(0);
116
});
117
118
it('should add multiple nodes and cluster them efficiently', () => {
119
const embeddings = createSimilarEmbeddings();
120
const nodes = [
121
createNode('search', 'lookup', embeddings[0]),
122
createNode('find', 'lookup', embeddings[1]), // Similar to search
123
createNode('create', 'generate', embeddings[2]),
124
createNode('make', 'generate', embeddings[3]), // Similar to create
125
createNode('random', 'misc', embeddings[4]) // Outlier
126
];
127
128
grouper.addNodes(nodes);
129
130
const clusters = grouper.getClusters();
131
expect(clusters.length).toBeGreaterThanOrEqual(2);
132
133
// Verify all nodes are assigned
134
const totalNodesInClusters = clusters.reduce((sum, cluster) => sum + cluster.nodes.length, 0);
135
expect(totalNodesInClusters).toBe(nodes.length);
136
});
137
138
it('should allow deferring clustering with reclusterAfter=false', () => {
139
const nodes = [
140
createNode('tool1', 'cat1', [1, 0, 0, 0]),
141
createNode('tool2', 'cat1', [0.9, 0.1, 0, 0]),
142
createNode('tool3', 'cat1', [0, 1, 0, 0])
143
];
144
145
grouper.addNodes(nodes, false);
146
147
// Should create singleton clusters without clustering
148
const clusters = grouper.getClusters();
149
expect(clusters).toHaveLength(3);
150
expect(clusters.every(c => c.nodes.length === 1)).toBe(true);
151
152
// Manual recluster should group similar nodes
153
grouper.recluster();
154
const clustersAfterRecluster = grouper.getClusters();
155
// Should potentially reduce cluster count due to grouping
156
expect(clustersAfterRecluster.length).toBeLessThanOrEqual(clusters.length);
157
});
158
});
159
160
describe('removeNode', () => {
161
it('should return false for non-existent node', () => {
162
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
163
const result = grouper.removeNode(node);
164
expect(result).toBe(false);
165
});
166
167
it('should remove node and return true for existing node', () => {
168
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
169
grouper.addNode(node);
170
171
const result = grouper.removeNode(node);
172
expect(result).toBe(true);
173
expect(grouper.getClusters()).toHaveLength(0);
174
});
175
176
it('should remove empty clusters when last node is removed', () => {
177
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
178
grouper.addNode(node);
179
expect(grouper.getClusters()).toHaveLength(1);
180
181
grouper.removeNode(node);
182
expect(grouper.getClusters()).toHaveLength(0);
183
});
184
185
it('should update cluster when node is removed from multi-node cluster', () => {
186
const embeddings = createSimilarEmbeddings();
187
const nodes = [
188
createNode('tool1', 'category1', embeddings[0]),
189
createNode('tool2', 'category1', embeddings[1]),
190
createNode('tool3', 'category1', embeddings[0]) // Very similar to first
191
];
192
193
nodes.forEach(node => grouper.addNode(node));
194
grouper.recluster();
195
196
const initialClusters = grouper.getClusters();
197
const clusterWithMultipleNodes = initialClusters.find(c => c.nodes.length > 1);
198
199
if (clusterWithMultipleNodes && clusterWithMultipleNodes.nodes.length > 1) {
200
const nodeToRemove = clusterWithMultipleNodes.nodes[0];
201
grouper.removeNode(nodeToRemove);
202
203
const updatedCluster = grouper.getClusterForNode(clusterWithMultipleNodes.nodes[1]);
204
expect(updatedCluster).toBeDefined();
205
expect(updatedCluster!.nodes.some(n => n === nodeToRemove)).toBe(false);
206
}
207
});
208
});
209
210
describe('recluster', () => {
211
it('should handle empty node list', () => {
212
grouper.recluster();
213
expect(grouper.getClusters()).toHaveLength(0);
214
});
215
216
it('should create single cluster for single node', () => {
217
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
218
grouper.addNode(node);
219
grouper.recluster();
220
221
const clusters = grouper.getClusters();
222
expect(clusters).toHaveLength(1);
223
expect(clusters[0].nodes).toHaveLength(1);
224
});
225
226
it('should group similar embeddings together', () => {
227
const embeddings = createSimilarEmbeddings();
228
const nodes = [
229
createNode('search', 'lookup', embeddings[0]),
230
createNode('find', 'lookup', embeddings[1]), // Similar to search
231
createNode('create', 'generate', embeddings[2]),
232
createNode('make', 'generate', embeddings[3]), // Similar to create
233
createNode('random', 'misc', embeddings[4]) // Outlier
234
];
235
236
nodes.forEach(node => grouper.addNode(node));
237
grouper.recluster();
238
239
const clusters = grouper.getClusters();
240
expect(clusters.length).toBeGreaterThanOrEqual(2);
241
242
// Find clusters for similar nodes
243
const searchCluster = grouper.getClusterForNode(nodes[0]);
244
const findCluster = grouper.getClusterForNode(nodes[1]);
245
const createCluster = grouper.getClusterForNode(nodes[2]);
246
const makeCluster = grouper.getClusterForNode(nodes[3]);
247
248
// Similar nodes should be in same clusters
249
expect(searchCluster).toBeDefined();
250
expect(findCluster).toBeDefined();
251
expect(createCluster).toBeDefined();
252
expect(makeCluster).toBeDefined();
253
});
254
255
it('should respect minimum cluster size option', () => {
256
const grouper = new EmbeddingsGrouper<TestTool>({ minClusterSize: 3 });
257
const embeddings = createSimilarEmbeddings();
258
const nodes = [
259
createNode('tool1', 'cat1', embeddings[0]),
260
createNode('tool2', 'cat1', embeddings[1])
261
];
262
263
nodes.forEach(node => grouper.addNode(node));
264
grouper.recluster();
265
266
// With minClusterSize=3, these 2 similar nodes should be separate singletons
267
const clusters = grouper.getClusters();
268
expect(clusters.every(c => c.nodes.length < 3)).toBe(true);
269
});
270
});
271
272
describe('getClusterForNode', () => {
273
it('should return undefined for non-existent node', () => {
274
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
275
const cluster = grouper.getClusterForNode(node);
276
expect(cluster).toBeUndefined();
277
});
278
279
it('should return correct cluster for existing node', () => {
280
const node = createNode('tool1', 'category1', [1, 0, 0, 0]);
281
grouper.addNode(node);
282
283
const cluster = grouper.getClusterForNode(node);
284
expect(cluster).toBeDefined();
285
expect(cluster!.nodes).toContain(node);
286
});
287
});
288
289
describe('clustering quality', () => {
290
it('should handle identical embeddings', () => {
291
const identicalEmbedding = [1, 0, 0, 0];
292
const nodes = [
293
createNode('tool1', 'cat1', identicalEmbedding),
294
createNode('tool2', 'cat1', identicalEmbedding),
295
createNode('tool3', 'cat1', identicalEmbedding)
296
];
297
298
nodes.forEach(node => grouper.addNode(node));
299
grouper.recluster();
300
301
// All identical embeddings should be in same cluster
302
const clusters = grouper.getClusters();
303
expect(clusters).toHaveLength(1);
304
expect(clusters[0].nodes).toHaveLength(3);
305
});
306
307
it('should handle zero vectors', () => {
308
const zeroEmbedding = [0, 0, 0, 0];
309
const node = createNode('tool1', 'cat1', zeroEmbedding);
310
grouper.addNode(node);
311
grouper.recluster();
312
313
const clusters = grouper.getClusters();
314
expect(clusters).toHaveLength(1);
315
expect(clusters[0].centroid).toEqual([0, 0, 0, 0]);
316
});
317
318
it('should handle varied similarity distributions', () => {
319
// Create embeddings with different similarity levels
320
const nodes = [
321
createNode('very_similar_1', 'cat1', [1, 0.1, 0, 0]),
322
createNode('very_similar_2', 'cat1', [0.9, 0.1, 0, 0]),
323
createNode('somewhat_similar', 'cat1', [0.7, 0.3, 0.1, 0]),
324
createNode('different_1', 'cat2', [0, 0, 1, 0.1]),
325
createNode('different_2', 'cat2', [0.1, 0, 0.9, 0.1]),
326
createNode('outlier', 'cat3', [0.25, 0.25, 0.25, 0.25])
327
];
328
329
nodes.forEach(node => grouper.addNode(node));
330
grouper.recluster();
331
332
const clusters = grouper.getClusters();
333
expect(clusters.length).toBeGreaterThan(1);
334
expect(clusters.length).toBeLessThanOrEqual(nodes.length);
335
336
// Verify each node is assigned to exactly one cluster
337
const allNodesInClusters = clusters.flatMap(c => c.nodes);
338
expect(allNodesInClusters).toHaveLength(nodes.length);
339
});
340
});
341
342
describe('adaptive threshold behavior', () => {
343
it('should work with different similarity percentiles', () => {
344
const strictGrouper = new EmbeddingsGrouper<TestTool>({ eps: 0.95 }); // High similarity required
345
const lenientGrouper = new EmbeddingsGrouper<TestTool>({ eps: 0.8 }); // Lower similarity required
346
347
const embeddings = createSimilarEmbeddings();
348
const nodes = embeddings.map((emb, i) =>
349
createNode(`tool${i}`, 'category', emb)
350
);
351
352
// Add same nodes to both groupers
353
nodes.forEach(node => {
354
strictGrouper.addNode({ ...node });
355
lenientGrouper.addNode({ ...node });
356
});
357
358
strictGrouper.recluster();
359
lenientGrouper.recluster();
360
361
const strictClusters = strictGrouper.getClusters();
362
const lenientClusters = lenientGrouper.getClusters();
363
364
// Stricter threshold should generally create more clusters
365
expect(strictClusters.length).toBeGreaterThanOrEqual(lenientClusters.length);
366
});
367
});
368
369
describe('centroid computation', () => {
370
it('should compute correct centroids for clusters', () => {
371
const nodes = [
372
createNode('tool1', 'cat1', [1, 0, 0, 0]),
373
createNode('tool2', 'cat1', [0, 1, 0, 0])
374
];
375
376
nodes.forEach(node => grouper.addNode(node));
377
grouper.recluster();
378
379
const clusters = grouper.getClusters();
380
clusters.forEach(cluster => {
381
expect(cluster.centroid).toBeDefined();
382
expect(cluster.centroid.length).toBeGreaterThan(0);
383
384
// Centroid should be normalized (magnitude ≈ 1 for non-zero vectors)
385
const magnitude = Math.sqrt(cluster.centroid.reduce((sum, val) => sum + val * val, 0));
386
if (magnitude > 0) {
387
expect(magnitude).toBeCloseTo(1, 5);
388
}
389
});
390
});
391
});
392
393
describe('threshold tuning optimization', () => {
394
it('should find optimal percentile for target cluster count', () => {
395
const embeddings = createSimilarEmbeddings();
396
const nodes = [
397
createNode('search', 'lookup', embeddings[0]),
398
createNode('find', 'lookup', embeddings[1]), // Similar to search
399
createNode('create', 'generate', embeddings[2]),
400
createNode('make', 'generate', embeddings[3]), // Similar to create
401
createNode('random1', 'misc', embeddings[4]), // Outlier
402
createNode('random2', 'misc', [0.3, 0.3, 0.3, 0.1]), // Another outlier
403
createNode('random3', 'misc', [0.1, 0.3, 0.3, 0.3]) // Another outlier
404
];
405
406
grouper.addNodes(nodes, false); // Don't cluster initially
407
408
// Find optimal percentile for max 4 clusters
409
const result = grouper.tuneThresholdForTargetClusters(4);
410
411
expect(result.clusterCount).toBeLessThanOrEqual(4);
412
expect(result.percentile).toBeGreaterThanOrEqual(80);
413
expect(result.percentile).toBeLessThanOrEqual(99);
414
expect(result.threshold).toBeGreaterThan(0);
415
});
416
417
it('should apply percentile and recluster efficiently', () => {
418
const embeddings = createSimilarEmbeddings();
419
const nodes = embeddings.map((emb, i) =>
420
createNode(`tool${i}`, 'category', emb)
421
);
422
423
grouper.addNodes(nodes);
424
425
// Apply a stricter percentile (should create more clusters)
426
grouper.applyPercentileAndRecluster(98);
427
const strictClusterCount = grouper.getClusters().length;
428
429
// Apply a more lenient percentile (should create fewer clusters)
430
grouper.applyPercentileAndRecluster(85);
431
const lenientClusterCount = grouper.getClusters().length;
432
433
// Stricter threshold should generally create more or equal clusters
434
expect(strictClusterCount).toBeGreaterThanOrEqual(lenientClusterCount);
435
436
// All nodes should still be assigned
437
const allClusters = grouper.getClusters();
438
const totalNodes = allClusters.reduce((sum, cluster) => sum + cluster.nodes.length, 0);
439
expect(totalNodes).toBe(nodes.length);
440
});
441
442
it('should cache similarities for efficient repeated tuning', () => {
443
// Create embeddings with more predictable clustering behavior
444
const nodes: Node<TestTool>[] = [];
445
446
// Create 3 distinct groups of 10 nodes each using deterministic trigonometric patterns
447
for (let group = 0; group < 3; group++) {
448
for (let i = 0; i < 10; i++) {
449
// Each group occupies a different region of the 4D space using trigonometry
450
const baseAngle = group * (2 * Math.PI / 3); // 120° separation between groups
451
const variation = i * 0.1; // Small variation within group
452
453
const embedding = [
454
Math.cos(baseAngle + variation) * 0.8 + 0.2,
455
Math.sin(baseAngle + variation) * 0.8 + 0.2,
456
Math.cos(baseAngle * 2 + variation * 0.5) * 0.3 + 0.1,
457
Math.sin(baseAngle * 2 + variation * 0.5) * 0.3 + 0.1
458
];
459
nodes.push(createNode(`tool${group}_${i}`, `cat${group}`, embedding));
460
}
461
}
462
463
grouper.addNodes(nodes, false);
464
465
const result1 = grouper.tuneThresholdForTargetClusters(15); // Very lenient
466
const result2 = grouper.tuneThresholdForTargetClusters(8); // Moderate
467
const result3 = grouper.tuneThresholdForTargetClusters(5); // Strict
468
469
// With deterministic trigonometric data, these should be more predictable
470
expect(result1.clusterCount).toBeLessThanOrEqual(15);
471
expect(result2.clusterCount).toBeLessThanOrEqual(8);
472
473
// For the strictest case, we have 3 natural groups, so expect something reasonable
474
expect(result3.clusterCount).toBeLessThanOrEqual(5);
475
476
// Verify the algorithm works in the right direction (more restrictive = fewer clusters)
477
expect(result1.clusterCount).toBeGreaterThanOrEqual(result2.clusterCount);
478
expect(result2.clusterCount).toBeGreaterThanOrEqual(result3.clusterCount);
479
}); it('should handle edge cases in threshold tuning', () => {
480
// Empty grouper
481
const emptyResult = grouper.tuneThresholdForTargetClusters(5);
482
expect(emptyResult.clusterCount).toBe(0);
483
484
// Single node
485
grouper.addNode(createNode('single', 'cat', [1, 0, 0, 0]));
486
const singleResult = grouper.tuneThresholdForTargetClusters(5);
487
expect(singleResult.clusterCount).toBe(1);
488
489
// Target higher than possible clusters
490
const nodes = [
491
createNode('tool1', 'cat1', [1, 0, 0, 0]),
492
createNode('tool2', 'cat1', [0, 1, 0, 0])
493
];
494
grouper.addNodes(nodes);
495
const highTargetResult = grouper.tuneThresholdForTargetClusters(10);
496
expect(highTargetResult.clusterCount).toBeLessThanOrEqual(3); // At most 3 nodes total
497
});
498
});
499
500
describe('edge cases', () => {
501
it('should handle single dimension embeddings', () => {
502
const nodes = [
503
createNode('tool1', 'cat1', [1]),
504
createNode('tool2', 'cat1', [0.9]),
505
createNode('tool3', 'cat1', [0.1])
506
];
507
508
nodes.forEach(node => grouper.addNode(node));
509
grouper.recluster();
510
511
const clusters = grouper.getClusters();
512
expect(clusters.length).toBeGreaterThan(0);
513
});
514
515
it('should handle large number of nodes efficiently', () => {
516
const nodes: Node<TestTool>[] = [];
517
for (let i = 0; i < 100; i++) {
518
// Create diverse embeddings that will form distinct clusters
519
const groupId = Math.floor(i / 10); // 10 groups of 10 nodes each
520
const withinGroupVariation = (i % 10) * 0.1;
521
522
// Create embeddings with clear group separation
523
const embedding = [
524
groupId === 0 ? 1 - withinGroupVariation * 0.2 : withinGroupVariation * 0.2,
525
groupId === 1 ? 1 - withinGroupVariation * 0.2 : withinGroupVariation * 0.2,
526
groupId === 2 ? 1 - withinGroupVariation * 0.2 : withinGroupVariation * 0.2,
527
groupId >= 3 ? 1 - withinGroupVariation * 0.2 : withinGroupVariation * 0.2
528
];
529
nodes.push(createNode(`tool${i}`, `cat${groupId}`, embedding));
530
}
531
532
nodes.forEach(node => grouper.addNode(node));
533
grouper.recluster();
534
535
const clusters = grouper.getClusters();
536
// Should create multiple clusters but not more than the number of nodes
537
expect(clusters.length).toBeGreaterThanOrEqual(2);
538
expect(clusters.length).toBeLessThanOrEqual(nodes.length);
539
540
// Verify all nodes are assigned
541
const totalNodesInClusters = clusters.reduce((sum, cluster) => sum + cluster.nodes.length, 0);
542
expect(totalNodesInClusters).toBe(nodes.length);
543
});
544
});
545
});
546
547
548
549