Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/script/compareStestAlternativeRuns.ts
13383 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
/* eslint-disable local/code-no-dangerous-type-assertions */
6
7
import { AssertionError } from 'assert';
8
import { execFile } from 'child_process';
9
import { promises as fs } from 'fs';
10
import * as path from 'path';
11
12
/**
 * An entry from `baseline.json`.
 *
 * Only the fields this script reads are declared here.
 */
interface BaselineTestResult {
	/**
	 * Test name. This script parses it further: the part after the first ` - `
	 * is used as the test name, and an optional ` ([provider])` marker selects the flavor.
	 */
	name: string;
	score: number;
	passCount: number;
	failCount: number;
	contentFilterCount: number;
	/** Free-form attributes; the three `CompScore*` entries are read explicitly by this script. */
	attributes: (Record<string, string | number> & { ['CompScore1']: number | undefined } & { ['CompScore2']: number | undefined } & { ['CompScore3']: number | undefined });
}
24
25
/**
 * Category of a test, encoded as a `[Kind]` prefix at the start of the test name.
 */
enum SignalKind {
	OldFormat = 'OldFormat',
	MustHave = 'MustHave',
	NiceToHave = 'NiceToHave',
	BadSuggestions = 'BadSuggestions',
	Other = 'Other',
}

namespace SignalKind {
	// `Object.values` on the merged enum/namespace object also yields the functions
	// exported from this namespace, so keep only the string members.
	const kinds: readonly SignalKind[] = Object.values(SignalKind).filter((value): value is SignalKind => typeof value === 'string');

	// Built once: matches a leading `[<kind>]` and captures `<kind>`.
	const signalKindRe = new RegExp(`^\\[(${kinds.join('|')})\\]`);

	/**
	 * Extracts the signal kind from a test name.
	 *
	 * @param testName test name, expected to start with `[<SignalKind>]` when categorized
	 * @returns the matching enum member, or `undefined` when the name has no known `[Kind]` prefix
	 */
	export function getFromTestName(testName: string): SignalKind | undefined {
		const bracketed = signalKindRe.exec(testName)?.[1];
		return kinds.find(kind => kind === bracketed);
	}
}
42
43
/**
 * All baseline entries that share one test name (one entry per flavor),
 * plus the values derived for that test.
 */
interface TestResult {
	/** unflavored */
	name: string;
	/** Kind parsed from the `[Kind]` prefix of `name`, if any. */
	signalKind: SignalKind | undefined;
	/** Baseline entries grouped under `name`. */
	testResults: BaselineTestResult[];
	/** `CompScore1` attribute taken from the first grouped baseline entry. */
	compScore1: number | undefined;
	/** `CompScore2` attribute taken from the first grouped baseline entry. */
	compScore2: number | undefined;
	/** `CompScore3` attribute taken from the first grouped baseline entry. */
	compScore3: number | undefined;
}
52
53
/** Matches the ` ([providerName])` marker inside a baseline test name; group 1 is the provider name. */
const regexForProviderName = / \(\[(([a-zA-Z0-9\-])+)\]\)/;
/** Flavor reported for tests whose name carries no provider marker. */
const DEFAULT_PROVIDER_NAME = 'Default Provider';

/**
 * Derives the flavor (table column) of a baseline entry from its name.
 *
 * @param testResult baseline entry whose `name` may contain ` ([providerName])`
 * @returns a short alias for the known provider names, the raw provider name
 *          for any other marker, or `DEFAULT_PROVIDER_NAME` when there is no marker
 */
function getFlavor(testResult: BaselineTestResult): string {
	const match = testResult.name.match(regexForProviderName);
	if (!match) {
		return DEFAULT_PROVIDER_NAME;
	}

	const providerName = match[1];
	switch (providerName) {
		case 'prodFineTunedModel': return 'NES';
		case 'prodFineTunedModelWithSummarizedDocument': return 'NES-summ';
		case 'speculativeEditingInlineEditProvider': return 'SpecEdit';
		default: return providerName;
	}
}
70
71
/**
 * Groups the NES / flavored InlineEdit entries of a baseline by test name.
 *
 * Only entries whose name starts with `NES `, or starts with `InlineEdit` and
 * contains `])`, are considered. The test name is the part of the full name
 * after the first ` - `.
 *
 * @param baseline parsed contents of a baseline file
 * @returns one `TestResult` per test name, sorted by name via `localeCompare`
 * @throws AssertionError when a considered entry's name contains no ` - ` separator
 */
function computeTestResultsFromBaseline(baseline: BaselineTestResult[]): TestResult[] {

	const nesTestsWithFlavor = baseline.filter(({ name }) =>
		name.startsWith('NES ') || (name.startsWith('InlineEdit') && name.includes('])')));

	const suiteTestSeparator = ' - ';

	const fullNameToTestName = (fullName: string): string => {
		const indexOfSuiteTestNameSplit = fullName.indexOf(suiteTestSeparator);
		// `slice` never returns `undefined`, so a missing separator has to be detected on the index.
		if (indexOfSuiteTestNameSplit === -1) {
			throw new AssertionError({ message: `does not follow the expected pattern: ${fullName}` });
		}
		return fullName.slice(indexOfSuiteTestNameSplit + suiteTestSeparator.length);
	};

	const testNameToResults = new Map<string, BaselineTestResult[]>();

	for (const nesTest of nesTestsWithFlavor) {
		const testName = fullNameToTestName(nesTest.name);
		const bucket = testNameToResults.get(testName);
		if (bucket) {
			bucket.push(nesTest);
		} else {
			testNameToResults.set(testName, [nesTest]);
		}
	}

	return Array.from(testNameToResults.entries())
		.sort(([aTestName], [bTestName]) => aTestName.localeCompare(bTestName))
		.map(([testName, baselineTestResults]): TestResult => {
			// The comp scores are read from the first entry of the group only.
			const attributes = baselineTestResults[0]?.attributes;
			return {
				name: testName,
				signalKind: SignalKind.getFromTestName(testName),
				testResults: baselineTestResults,
				compScore1: attributes?.CompScore1,
				compScore2: attributes?.CompScore2,
				compScore3: attributes?.CompScore3,
			};
		});
}
111
112
/**
 * Marks a table cell as highlighted by appending ` *` to it.
 */
function formatAsBold(text: string) {
	const marker = ' *';
	return text + marker;
}
115
116
/**
 * Wraps `text` in the ANSI escape sequence for the given foreground color.
 *
 * @param text text to colorize
 * @param color color name, or `undefined` to leave `text` untouched
 * @returns `text` surrounded by the color escape and a reset (`\x1b[0m`), or `text` itself when no color is given
 */
function formatAsColored(text: string, color: 'green' | 'violet' | 'red' | undefined) {
	if (!color) {
		return text;
	}
	// ANSI SGR foreground color codes.
	const ansiCodeByColor = {
		green: 32,
		red: 31,
		violet: 35,
	} as const;
	return `\x1b[${ansiCodeByColor[color]}m${text}\x1b[0m`;
}
127
128
/**
 * Pass criterion for BadSuggestion tests: any strictly positive score is a
 * pass, everything else is a fail.
 */
function isBadSuggestionPassed(score: number): boolean {
	const passed = 0 < score;
	return passed;
}
132
133
/**
 * Renders `passed / total` as a percentage with two decimals.
 * A `total` of zero yields `0.00%` rather than dividing by zero.
 */
function formatPassRatio(passed: number, total: number): string {
	const percentage = total === 0 ? '0.00' : ((passed / total) * 100).toFixed(2);
	return `${percentage}%`;
}
140
141
/**
 * Score of one test per flavor: a plain number, an old/new pair when an old
 * score is available for comparison, or `undefined` when there is no score.
 */
type TestScoreByFlavor = Record<
	string /* flavor */,
	number | { oldScore: number; newScore: number } | undefined
>;

/** The data behind one table row: a test, its scores per flavor, and its optional signal kind. */
type AggregatedTest = {
	test: string;
	scores: TestScoreByFlavor;
	signalKind?: SignalKind;
};
143
144
/**
 * Prints the aggregated scores with `console.table`, grouped by signal kind.
 *
 * For every signal kind the table contains a header row, one row per test, a
 * subtotal row and an empty separator row. After all groups come a grand
 * total row (BadSuggestions excluded) and a BadSuggestion pass-ratio row.
 * BadSuggestion tests are shown as Pass/Fail and summarized as a pass ratio
 * instead of a summed score.
 *
 * @param data one entry per test
 * @param options.compare render `old -> new` where the old and new values differ
 * @param options.useColoredOutput color changes green (improved) or red (regressed)
 * @param options.filterProviders lower-cased provider names to keep; all providers when omitted
 * @param options.omitEqual skip test rows in which every shown provider has the same score
 */
function printTable(data: AggregatedTest[], { compare, useColoredOutput, filterProviders, omitEqual }: { compare: boolean; useColoredOutput: boolean; filterProviders?: string[]; omitEqual: boolean }) {
	const providers = Array.from(new Set(data.flatMap(d => Object.keys(d.scores))));
	const filteredProviders = filterProviders ? providers.filter(provider => filterProviders.includes(provider.toLocaleLowerCase())) : providers;

	/** Creates a counter map with every shown provider initialized to 0. */
	const zeroByProvider = (): Record<string, number> => {
		const counters: Record<string, number> = {};
		for (const provider of filteredProviders) {
			counters[provider] = 0;
		}
		return counters;
	};

	/** Renders `old -> new`, colored by whether the change is an improvement (when colors are enabled). */
	const formatChange = (oldText: string, newText: string, improved: boolean): string => {
		const color = useColoredOutput ? (improved ? 'green' : 'red') : undefined;
		return formatAsColored(`${oldText} -> ${newText}`, color);
	};

	/** Renders a (sub)total score cell; `Comp*` columns never show a comparison. */
	const formatScoreTotal = (provider: string, oldTotal: number, total: number): string => {
		if (compare && Math.abs(oldTotal - total) > 0.001 && !provider.startsWith('Comp')) {
			return formatChange(oldTotal.toFixed(2), total.toFixed(2), oldTotal < total);
		}
		return total.toFixed(2);
	};

	/** Renders a pass-ratio cell; the old ratio is computed against the same `total`. */
	const formatPassRatioCell = (passed: number, oldPassed: number, total: number): string => {
		const passRatio = formatPassRatio(passed, total);
		if (!compare || oldPassed === passed) {
			return passRatio;
		}
		return formatChange(formatPassRatio(oldPassed, total), passRatio, passed > oldPassed);
	};

	/** Renders the cell of a single test for a single provider. */
	const formatTestCell = (rawScore: TestScoreByFlavor[string], score: number, maxScore: number, areAllScoresEqual: boolean, isBadSuggestionCategory: boolean): string => {
		if (rawScore === undefined) {
			return '-';
		}
		const hasOldScore = compare && typeof rawScore === 'object';

		if (isBadSuggestionCategory) {
			// For BadSuggestion, show "Pass" or "Fail" instead of score
			if (hasOldScore) {
				const oldResult = isBadSuggestionPassed(rawScore.oldScore) ? 'Pass' : 'Fail';
				const newResult = isBadSuggestionPassed(rawScore.newScore) ? 'Pass' : 'Fail';
				if (oldResult !== newResult) {
					return formatChange(oldResult, newResult, newResult === 'Pass');
				}
			}
			return isBadSuggestionPassed(score) ? 'Pass' : 'Fail';
		}

		// Regular formatting for non-BadSuggestion tests
		if (hasOldScore && rawScore.oldScore !== rawScore.newScore) {
			return formatChange(rawScore.oldScore.toFixed(2), rawScore.newScore.toFixed(2), rawScore.newScore > rawScore.oldScore);
		}
		const formattedScore = score.toFixed(2);
		// Highlight the best score(s) of the row, unless all scores tie.
		if (maxScore - score < 0.001 && !areAllScoresEqual) {
			return formatAsBold(formattedScore);
		}
		return formattedScore;
	};

	// Group tests by signal kind, keeping first-seen order of the kinds.
	const testsBySignalKind = new Map<string, AggregatedTest[]>();
	for (const item of data) {
		const group = item.signalKind ?? SignalKind.Other;
		const bucket = testsBySignalKind.get(group);
		if (bucket) {
			bucket.push(item);
		} else {
			testsBySignalKind.set(group, [item]);
		}
	}

	const tableData: Record<string, string>[] = [];

	const totalScoreByProvider = zeroByProvider();
	const oldTotalScoreByProvider = zeroByProvider();

	// Track pass/fail counts for BadSuggestion tests
	const badSuggestionPassedByProvider = zeroByProvider();
	const badSuggestionTotalByProvider = zeroByProvider();
	const oldBadSuggestionPassedByProvider = zeroByProvider();

	// Iterate over each signal kind
	for (const [signalKind, tests] of testsBySignalKind) {
		// add header
		tableData.push({ 'Test Name': `=== ${signalKind} ===` });

		const totalByProviderForSignalKind = zeroByProvider();
		const oldTotalByProviderForSignalKind = zeroByProvider();

		// Track pass/fail counts for BadSuggestion tests within this signal kind
		const badSuggestionPassedByProviderForSignalKind = zeroByProvider();
		const badSuggestionTotalByProviderForSignalKind = zeroByProvider();
		const oldBadSuggestionPassedByProviderForSignalKind = zeroByProvider();

		const isBadSuggestionCategory = signalKind === SignalKind.BadSuggestions;

		for (const test of tests) {
			// Totals are accumulated here, before the `omitEqual` check below,
			// so rows that are not printed still count towards the totals.
			const scores = filteredProviders.map(provider => {
				const score = test.scores[provider];
				const oldScore = typeof score === 'object' ? score.oldScore : undefined;
				// A missing score counts as 0.
				const numericScore = typeof score === 'object' ? score.newScore : (score ?? 0);

				if (isBadSuggestionCategory) {
					badSuggestionTotalByProvider[provider]++;
					badSuggestionTotalByProviderForSignalKind[provider]++;

					if (isBadSuggestionPassed(numericScore)) {
						badSuggestionPassedByProvider[provider]++;
						badSuggestionPassedByProviderForSignalKind[provider]++;
					}

					if (oldScore !== undefined && isBadSuggestionPassed(oldScore)) {
						oldBadSuggestionPassedByProvider[provider]++;
						oldBadSuggestionPassedByProviderForSignalKind[provider]++;
					}
				} else {
					// Regular handling for non-BadSuggestion tests
					totalByProviderForSignalKind[provider] += numericScore;
					totalScoreByProvider[provider] += numericScore;
					oldTotalByProviderForSignalKind[provider] += oldScore ?? 0;
					oldTotalScoreByProvider[provider] += oldScore ?? 0;
				}

				return numericScore;
			});

			const maxScore = Math.max(...scores);
			const minScore = Math.min(...scores);
			const areAllScoresEqual = maxScore === minScore;

			if (omitEqual && areAllScoresEqual) {
				continue;
			}

			const resultRow: Record<string, string> = { 'Test Name': test.test };
			filteredProviders.forEach((provider, i) => {
				resultRow[provider] = formatTestCell(test.scores[provider], scores[i], maxScore, areAllScoresEqual, isBadSuggestionCategory);
			});

			tableData.push(resultRow);
		}

		// Add subtotal for signal kind
		const subtotalRow: Record<string, string> = { 'Test Name': `${signalKind} Subtotal (${tests.length} tests)` };
		for (const provider of filteredProviders) {
			subtotalRow[provider] = isBadSuggestionCategory
				? formatPassRatioCell(
					badSuggestionPassedByProviderForSignalKind[provider],
					oldBadSuggestionPassedByProviderForSignalKind[provider],
					badSuggestionTotalByProviderForSignalKind[provider])
				: formatScoreTotal(provider, oldTotalByProviderForSignalKind[provider], totalByProviderForSignalKind[provider]);
		}
		tableData.push(subtotalRow, { 'Test Name': '' });
	}

	// Add total (don't include BadSuggestion in the grand total)
	const totalRow: Record<string, string> = { 'Test Name': 'Grand Total (excluding BadSuggestions)' };
	for (const provider of filteredProviders) {
		totalRow[provider] = formatScoreTotal(provider, oldTotalScoreByProvider[provider], totalScoreByProvider[provider]);
	}
	tableData.push(totalRow);

	// Add BadSuggestion aggregate pass ratio
	const badSuggestionRow: Record<string, string> = { 'Test Name': 'BadSuggestion Pass Ratio' };
	for (const provider of filteredProviders) {
		badSuggestionRow[provider] = formatPassRatioCell(
			badSuggestionPassedByProvider[provider],
			oldBadSuggestionPassedByProvider[provider],
			badSuggestionTotalByProvider[provider]);
	}
	tableData.push(badSuggestionRow);

	console.table(tableData);
}
357
358
/** Location of the current baseline, relative to this script's directory. */
const DEFAULT_BASELINE_JSON_PATH = path.join(__dirname, '..', 'test', 'simulation', 'baseline.json');
/** Location of the previous baseline, relative to this script's directory. */
const DEFAULT_BASELINE_OLD_JSON_PATH = path.join(__dirname, '..', 'test', 'simulation', 'baseline.old.json');
360
361
/**
 * Entry point: reads the baseline, optionally an old baseline, and prints the
 * comparison table.
 *
 * Supported arguments:
 * - `--compare`: also read `baseline.old.json` (next to the baseline) and show old -> new changes
 * - `--upgrade-old-baseline`: overwrite `baseline.old.json` with the baseline as committed at `HEAD`
 * - `--color`: colorize changes
 * - `--omit-equal`: hide rows in which all providers score the same
 * - `--filter=a,b`: only show the listed providers (case-insensitive)
 * - `--external-baseline=<path>`: use this baseline file instead of the default one
 *
 * Rejects when a baseline cannot be read or parsed, or when `git show` fails.
 */
async function main() {
	const args = process.argv.slice(2);

	/** Value of the first `<prefix>value` argument; everything after the prefix, so values may contain `=`. */
	const getOptionValue = (prefix: string): string | undefined => {
		const arg = args.find(candidate => candidate.startsWith(prefix));
		return arg === undefined ? undefined : arg.slice(prefix.length);
	};

	const compare = args.includes('--compare');
	const upgradeBaselineOldJson = args.includes('--upgrade-old-baseline');
	const useColoredOutput = args.includes('--color');
	const omitEqual = args.includes('--omit-equal');
	const filterValue = getOptionValue('--filter=');
	const filterProviders = filterValue === undefined ? undefined : filterValue.split(',').map(s => s.toLocaleLowerCase());
	const externalBaselinePath = getOptionValue('--external-baseline=');

	// Determine baseline paths; an empty `--external-baseline=` falls back to the default.
	const baselinePath = externalBaselinePath ? path.resolve(externalBaselinePath) : DEFAULT_BASELINE_JSON_PATH;
	const oldBaselinePath = path.join(path.dirname(baselinePath), 'baseline.old.json');

	/** Reads and parses one baseline file, logging which file failed before rethrowing. */
	const readBaseline = async (filePath: string): Promise<BaselineTestResult[]> => {
		const fileName = path.basename(filePath);
		let contents: string;
		try {
			contents = await fs.readFile(filePath, 'utf8');
		} catch (e: unknown) {
			console.error(`Failed to read ${fileName}`);
			throw e;
		}
		try {
			return JSON.parse(contents) as BaselineTestResult[];
		} catch (e: unknown) {
			console.error(`Failed to parse ${fileName}`);
			throw e;
		}
	};

	const baseline = await readBaseline(baselinePath);

	if (upgradeBaselineOldJson) {
		// `git show <rev>:<path>` resolves <path> against the repository root unless it
		// starts with `./` or `../`, and it expects forward slashes.
		const pathRelativeToCwd = path.relative(process.cwd(), baselinePath).split(path.sep).join('/');
		const baselineJsonContentsFromHEAD = await new Promise<string>((resolve, reject) => {
			execFile(
				'git',
				['show', `HEAD:./${pathRelativeToCwd}`],
				// The default `maxBuffer` is 1 MiB; raise it so a large baseline is not cut off.
				{ maxBuffer: 256 * 1024 * 1024 },
				(error: Error | null, stdout: string) => {
					if (error) {
						reject(error);
						return;
					}
					resolve(stdout);
				});
		});
		await fs.writeFile(oldBaselinePath, baselineJsonContentsFromHEAD);
	}

	const oldBaseline = compare ? await readBaseline(oldBaselinePath) : undefined;

	const testResults = computeTestResultsFromBaseline(baseline);
	const oldTestResults = oldBaseline ? computeTestResultsFromBaseline(oldBaseline) : undefined;

	// Old scores per test name and flavor; stays empty when not comparing.
	const testNameToOldScoresByFlavor = new Map<string /* testName */, Record<string /* flavor */, number | undefined>>();
	for (const oldTestResult of oldTestResults ?? []) {
		const oldScoresByFlavor: Record<string, number | undefined> = {
			'Comp1': oldTestResult.compScore1,
			'Comp2': oldTestResult.compScore2,
			'Comp3': oldTestResult.compScore3,
		};
		for (const flavored of oldTestResult.testResults) {
			oldScoresByFlavor[getFlavor(flavored)] = flavored.score;
		}
		testNameToOldScoresByFlavor.set(oldTestResult.name, oldScoresByFlavor);
	}

	const result = testResults.map((testResult): AggregatedTest => {
		const oldScoresByFlavor = testNameToOldScoresByFlavor.get(testResult.name) ?? {};
		// The comp scores are always plain numbers: they are never paired with an old value.
		const scores: TestScoreByFlavor = {
			'Comp1': testResult.compScore1,
			'Comp2': testResult.compScore2,
			'Comp3': testResult.compScore3,
		};
		for (const flavored of testResult.testResults) {
			const flavor = getFlavor(flavored);
			const newScore = flavored.score;
			const oldScore = oldScoresByFlavor[flavor];
			scores[flavor] = oldScore === undefined ? newScore : { oldScore, newScore };
		}
		return {
			test: testResult.name,
			signalKind: testResult.signalKind,
			scores,
		};
	});

	printTable(result, { compare, useColoredOutput, filterProviders, omitEqual });
}
450
451
// Run the script; report a failure explicitly instead of leaving the promise floating.
main().catch((error: unknown) => {
	console.error(error);
	process.exitCode = 1;
});
452
453