Path: blob/main/extensions/copilot/script/compareStestAlternativeRuns.ts
13383 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
/* eslint-disable local/code-no-dangerous-type-assertions */

import { AssertionError } from 'assert';
import { execFile } from 'child_process';
import { promises as fs } from 'fs';
import * as path from 'path';

/**
 * An entry from `baseline.json`.
 */
interface BaselineTestResult {
	/** Test name */
	name: string;
	score: number;
	passCount: number;
	failCount: number;
	contentFilterCount: number;
	attributes: (Record<string, string | number> & { ['CompScore1']: number | undefined } & { ['CompScore2']: number | undefined } & { ['CompScore3']: number | undefined });
}

/** Category tag that a test name may start with, written as `[<kind>]`. */
enum SignalKind {
	OldFormat = 'OldFormat',
	MustHave = 'MustHave',
	NiceToHave = 'NiceToHave',
	BadSuggestions = 'BadSuggestions',
	Other = 'Other',
}

namespace SignalKind {
	// This namespace is merged into the enum object, so `Object.values(SignalKind)` also yields
	// `getFromTestName` itself. Keep only the string members so that the function's source text
	// never ends up inside the regular expression.
	const signalKindValues = Object.values(SignalKind).filter((value): value is SignalKind => typeof value === 'string');

	// Compiled once instead of on every call.
	const signalKindRe = new RegExp(`^\\[(${signalKindValues.join('|')})\\]`);

	/**
	 * Extracts the signal kind from a test name that starts with `[<SignalKind>]`.
	 *
	 * @param testName test name without the suite prefix
	 * @returns the signal kind named by the leading tag, or `undefined` when the name does not
	 * start with a known tag
	 */
	export function getFromTestName(testName: string): SignalKind | undefined {
		const match = signalKindRe.exec(testName);
		// The capture group can only ever hold one of the enum values.
		return match ? match[1] as SignalKind : undefined;
	}
}

/** All baseline entries (one per flavor) that share the same test name. */
interface TestResult {
	/** unflavored */
	name: string;
	signalKind: SignalKind | undefined;
	testResults: BaselineTestResult[];
	compScore1: number | undefined;
	compScore2: number | undefined;
	compScore3: number | undefined;
}

const regexForProviderName = / \(\[(([a-zA-Z0-9\-])+)\]\)/;
const DEFAULT_PROVIDER_NAME = 'Default Provider';

/** Separator between the suite part and the test part of a full test name. */
const SUITE_TEST_NAME_SEPARATOR = ' - ';

/**
 * Derives the flavor (table column name) of a baseline entry from the ` ([provider])` marker
 * in its name.
 *
 * @returns a short alias for the known provider names, the raw provider name otherwise, or
 * {@link DEFAULT_PROVIDER_NAME} when the name carries no provider marker
 */
function getFlavor(testResult: BaselineTestResult): string {
	const match = testResult.name.match(regexForProviderName);
	if (!match) {
		return DEFAULT_PROVIDER_NAME;
	}
	switch (match[1]) {
		case 'prodFineTunedModel': return 'NES';
		case 'prodFineTunedModelWithSummarizedDocument': return 'NES-summ';
		case 'speculativeEditingInlineEditProvider': return 'SpecEdit';
		default:
			return match[1];
	}
}

/**
 * Groups the flavored baseline entries by their test name (the part after the first ` - `).
 *
 * Only entries whose name starts with `NES `, or starts with `InlineEdit` and contains `])`,
 * are considered.
 *
 * @returns one {@link TestResult} per test name, sorted by name; the `compScore*` values are
 * taken from the first entry of each group
 * @throws AssertionError when a considered entry's name contains no ` - ` separator
 */
function computeTestResultsFromBaseline(baseline: BaselineTestResult[]): TestResult[] {

	const nesTestsWithFlavor = baseline.filter((currentBaselineTestResult) =>
		currentBaselineTestResult.name.startsWith('NES ') || (currentBaselineTestResult.name.startsWith('InlineEdit') && currentBaselineTestResult.name.includes('])')));

	const fullNameToTestName = (fullName: string) => {
		const indexOfSuiteTestNameSplit = fullName.indexOf(SUITE_TEST_NAME_SEPARATOR);
		// `indexOf` yields -1 for a missing separator; slicing from `-1 + 3` would silently drop
		// the first two characters of the name instead of reporting the malformed entry.
		if (indexOfSuiteTestNameSplit === -1) {
			throw new AssertionError({ message: `does not follow the expected pattern: ${fullName}` });
		}
		return fullName.slice(indexOfSuiteTestNameSplit + SUITE_TEST_NAME_SEPARATOR.length);
	};

	const testNameToResults = new Map<string, BaselineTestResult[]>();

	for (const nesTest of nesTestsWithFlavor) {
		const testName = fullNameToTestName(nesTest.name);
		const baselineTestResults = testNameToResults.get(testName) ?? [];
		baselineTestResults.push(nesTest);
		testNameToResults.set(testName, baselineTestResults);
	}

	const sortedTestNameToFlavor = Array.from(testNameToResults.entries());

	sortedTestNameToFlavor.sort(([aTestName], [bTestName]) => aTestName.localeCompare(bTestName));

	return sortedTestNameToFlavor.map(([testName, baselineTestResults]) => {
		const firstResult = baselineTestResults[0];
		return {
			name: testName,
			signalKind: SignalKind.getFromTestName(testName),
			testResults: baselineTestResults,
			compScore1: firstResult?.attributes?.CompScore1,
			compScore2: firstResult?.attributes?.CompScore2,
			compScore3: firstResult?.attributes?.CompScore3,
		} satisfies TestResult;
	});
}

/** Marks a cell by appending ` *`; used to highlight the best score in a row. */
function formatAsBold(text: string) {
	return `${text} *`;
}

/** Wraps `text` in the ANSI escape sequence for `color`; returns it unchanged when no color is given. */
function formatAsColored(text: string, color: 'green' | 'violet' | 'red' | undefined) {
	if (!color) {
		return text;
	}
	const colorMap = {
		'green': 32,
		'red': 31,
		'violet': 35,
	};
	return `\x1b[${colorMap[color]}m${text}\x1b[0m`;
}

// For BadSuggestion tests, a score > 0 is considered a pass, otherwise a fail
function isBadSuggestionPassed(score: number): boolean {
	return score > 0;
}

// Format pass ratio as a percentage string
function formatPassRatio(passed: number, total: number): string {
	if (total === 0) {
		return '0.00%';
	}
	return `${((passed / total) * 100).toFixed(2)}%`;
}

type TestScoreByFlavor = Record<string /* flavor */, number | { oldScore: number; newScore: number } | undefined>;
type AggregatedTest = { test: string; scores: TestScoreByFlavor; signalKind?: SignalKind };

/**
 * Prints the scores as a table (via `console.table`) with one row per test and one column per
 * provider, grouped by signal kind. Each group is followed by a subtotal row; the table ends
 * with a grand total row and an aggregate BadSuggestion pass-ratio row.
 *
 * @param data tests with their score per provider; a `{ oldScore, newScore }` value carries
 * both the old and the new score of a provider
 * @param options.compare render differing old/new values as `old -> new`
 * @param options.useColoredOutput color `old -> new` cells green (improved) or red (regressed)
 * @param options.filterProviders lower-cased provider names to keep; all providers when omitted
 * @param options.omitEqual skip the rows of tests on which every shown provider scores the same
 * (such tests still count towards the subtotals and totals)
 */
function printTable(data: AggregatedTest[], { compare, useColoredOutput, filterProviders, omitEqual }: { compare: boolean; useColoredOutput: boolean; filterProviders?: string[]; omitEqual: boolean }) {
	const providers = Array.from(new Set(data.flatMap(d => Object.keys(d.scores))));
	const filteredProviders = filterProviders ? providers.filter(provider => filterProviders.includes(provider.toLocaleLowerCase())) : providers;

	// Tests without a signal kind are grouped under `Other`.
	const aggregatedTestsBySignalKind = data.reduce((acc: Record<SignalKind, AggregatedTest[]>, item) => {
		const group = item.signalKind ?? SignalKind.Other;
		if (!acc[group]) {
			acc[group] = [];
		}
		acc[group].push(item);
		return acc;
	}, {} as Record<SignalKind, AggregatedTest[]>);

	const tableData: Record<string, string>[] = [];

	const totalScoreByProvider: Record<string, number> = {};
	const oldTotalScoreByProvider: Record<string, number> = {};

	// Track pass/fail counts for BadSuggestion tests
	const badSuggestionPassedByProvider: Record<string, number> = {};
	const badSuggestionTotalByProvider: Record<string, number> = {};
	const oldBadSuggestionPassedByProvider: Record<string, number> = {};

	for (const provider of filteredProviders) {
		totalScoreByProvider[provider] = 0;
		oldTotalScoreByProvider[provider] = 0;
		badSuggestionPassedByProvider[provider] = 0;
		badSuggestionTotalByProvider[provider] = 0;
		oldBadSuggestionPassedByProvider[provider] = 0;
	}

	// Iterate over each signal kind
	for (const [signalKind, tests] of Object.entries(aggregatedTestsBySignalKind)) {
		// add header
		tableData.push({ 'Test Name': `=== ${signalKind} ===` });

		const totalByProviderForSignalKind: Record<string /* provider */, number> = {};
		const oldTotalByProviderForSignalKind: Record<string /* provider */, number> = {};

		// Track pass/fail counts for BadSuggestion tests within this signal kind
		const badSuggestionPassedByProviderForSignalKind: Record<string, number> = {};
		const badSuggestionTotalByProviderForSignalKind: Record<string, number> = {};
		const oldBadSuggestionPassedByProviderForSignalKind: Record<string, number> = {};

		for (const provider of filteredProviders) {
			totalByProviderForSignalKind[provider] = 0;
			oldTotalByProviderForSignalKind[provider] = 0;
			badSuggestionPassedByProviderForSignalKind[provider] = 0;
			badSuggestionTotalByProviderForSignalKind[provider] = 0;
			oldBadSuggestionPassedByProviderForSignalKind[provider] = 0;
		}

		const isBadSuggestionCategory = signalKind === SignalKind.BadSuggestions;

		for (const test of tests) {
			// Numeric (new) score per shown provider; a missing score counts as 0.
			const scores = filteredProviders.map(provider => {
				const score = test.scores[provider];
				const oldScore = typeof score === 'object' ? score.oldScore : undefined;
				const numericScore = typeof score === 'object' ? score.newScore : score ?? 0;

				// Handle BadSuggestion scores differently
				if (isBadSuggestionCategory) {
					badSuggestionTotalByProvider[provider]++;
					badSuggestionTotalByProviderForSignalKind[provider]++;

					if (isBadSuggestionPassed(numericScore)) {
						badSuggestionPassedByProvider[provider]++;
						badSuggestionPassedByProviderForSignalKind[provider]++;
					}

					if (oldScore !== undefined) {
						if (isBadSuggestionPassed(oldScore)) {
							oldBadSuggestionPassedByProvider[provider]++;
							oldBadSuggestionPassedByProviderForSignalKind[provider]++;
						}
					}
				} else {
					// Regular handling for non-BadSuggestion tests
					totalByProviderForSignalKind[provider] += numericScore;
					oldTotalScoreByProvider[provider] += oldScore ?? 0;
					totalScoreByProvider[provider] += numericScore;
					oldTotalByProviderForSignalKind[provider] += oldScore ?? 0;
				}

				return numericScore;
			});

			const maxScore = Math.max(...scores);
			const minScore = Math.min(...scores);
			const areAllScoresEqual = maxScore === minScore;

			// The totals above were already updated; only the row itself is omitted.
			if (omitEqual && areAllScoresEqual) {
				continue;
			}

			const resultRow: Record<string, string> = { 'Test Name': test.test };
			for (let i = 0; i < filteredProviders.length; i++) {
				const provider = filteredProviders[i];
				const rawScore = test.scores[provider];
				const score = scores[i];

				let formattedScore: string;

				if (isBadSuggestionCategory) {
					// For BadSuggestion, show "Pass" or "Fail" instead of score
					formattedScore = isBadSuggestionPassed(score) ? 'Pass' : 'Fail';

					if (compare && typeof rawScore === 'object') {
						const oldResult = isBadSuggestionPassed(rawScore.oldScore) ? 'Pass' : 'Fail';
						const newResult = isBadSuggestionPassed(rawScore.newScore) ? 'Pass' : 'Fail';

						if (oldResult !== newResult) {
							const color = useColoredOutput ?
								(oldResult === 'Fail' && newResult === 'Pass' ? 'green' : 'red') :
								undefined;
							formattedScore = formatAsColored(`${oldResult} -> ${newResult}`, color);
						}
					}
				} else {
					// Regular formatting for non-BadSuggestion tests
					formattedScore = score.toFixed(2);
					if (compare && typeof rawScore === 'object' && rawScore.oldScore !== rawScore.newScore) {
						const color = useColoredOutput ? (rawScore.newScore > rawScore.oldScore ? 'green' : 'red') : undefined;
						formattedScore = formatAsColored(`${rawScore.oldScore.toFixed(2)} -> ${rawScore.newScore.toFixed(2)}`, color);
					} else if (maxScore - score < 0.001 && !areAllScoresEqual) {
						// Highlight the best score(s) of the row.
						formattedScore = formatAsBold(formattedScore);
					}
				}

				// A provider without a result for this test gets a placeholder.
				resultRow[provider] = typeof rawScore === 'undefined' ? '-' : formattedScore;
			}

			tableData.push(resultRow);
		}

		// Add subtotal for signal kind
		const subtotalRow: Record<string, string> = { 'Test Name': `${signalKind} Subtotal (${tests.length} tests)` };
		for (const provider of filteredProviders) {
			if (isBadSuggestionCategory) {
				// For BadSuggestion, show pass ratio
				const passedTests = badSuggestionPassedByProviderForSignalKind[provider];
				const totalTests = badSuggestionTotalByProviderForSignalKind[provider];
				const passRatio = formatPassRatio(passedTests, totalTests);

				if (compare) {
					const oldPassedTests = oldBadSuggestionPassedByProviderForSignalKind[provider];
					const oldPassRatio = formatPassRatio(oldPassedTests, totalTests);

					if (oldPassedTests !== passedTests) {
						const color = useColoredOutput ? (passedTests > oldPassedTests ? 'green' : 'red') : undefined;
						subtotalRow[provider] = formatAsColored(`${oldPassRatio} -> ${passRatio}`, color);
					} else {
						subtotalRow[provider] = passRatio;
					}
				} else {
					subtotalRow[provider] = passRatio;
				}
			} else {
				// Regular handling for non-BadSuggestion categories
				const oldSubTotal = oldTotalByProviderForSignalKind[provider];
				const subTotal = totalByProviderForSignalKind[provider];
				// Columns whose name starts with `Comp` are never rendered as an old/new comparison.
				if (compare && Math.abs(oldSubTotal - subTotal) > 0.001 && !provider.startsWith('Comp')) {
					const rawOut = `${oldSubTotal.toFixed(2)} -> ${subTotal.toFixed(2)}`;
					const color = useColoredOutput ? (oldSubTotal < subTotal ? 'green' : 'red') : undefined;
					subtotalRow[provider] = formatAsColored(rawOut, color);
				} else {
					subtotalRow[provider] = subTotal.toFixed(2);
				}
			}
		}
		tableData.push(subtotalRow, { 'Test Name': '' });
	}

	// Add total (don't include BadSuggestion in the grand total)
	const totalRow: Record<string, string> = { 'Test Name': 'Grand Total (excluding BadSuggestions)' };
	for (const provider of filteredProviders) {
		const oldTotal = oldTotalScoreByProvider[provider];
		const total = totalScoreByProvider[provider];
		if (compare && Math.abs(oldTotal - total) > 0.001 && !provider.startsWith('Comp')) {
			const rawOut = `${oldTotal.toFixed(2)} -> ${total.toFixed(2)}`;
			const color = useColoredOutput ? (oldTotal < total ? 'green' : 'red') : undefined;
			totalRow[provider] = formatAsColored(rawOut, color);
		} else {
			totalRow[provider] = total.toFixed(2);
		}
	}
	tableData.push(totalRow);

	// Add BadSuggestion aggregate pass ratio
	const badSuggestionRow: Record<string, string> = { 'Test Name': 'BadSuggestion Pass Ratio' };
	for (const provider of filteredProviders) {
		const passedTests = badSuggestionPassedByProvider[provider];
		const totalTests = badSuggestionTotalByProvider[provider];
		const passRatio = formatPassRatio(passedTests, totalTests);

		if (compare && totalTests > 0) {
			const oldPassedTests = oldBadSuggestionPassedByProvider[provider];
			const oldPassRatio = formatPassRatio(oldPassedTests, totalTests);

			if (oldPassedTests !== passedTests) {
				const color = useColoredOutput ? (passedTests > oldPassedTests ? 'green' : 'red') : undefined;
				badSuggestionRow[provider] = formatAsColored(`${oldPassRatio} -> ${passRatio}`, color);
			} else {
				badSuggestionRow[provider] = passRatio;
			}
		} else {
			badSuggestionRow[provider] = passRatio;
		}
	}
	tableData.push(badSuggestionRow);

	console.table(tableData);
}

const DEFAULT_BASELINE_JSON_PATH = path.join(__dirname, '../test/simulation/baseline.json');
const DEFAULT_BASELINE_OLD_JSON_PATH = path.join(__dirname, '../test/simulation/baseline.old.json');

/** Upper bound for the output of `git show`; the default of 1 MiB would abort on a larger baseline. */
const GIT_SHOW_MAX_BUFFER = 256 * 1024 * 1024;

/**
 * Returns the value of the first argument that starts with `prefix` (e.g. `--filter=`), i.e.
 * everything after the prefix — including any further `=` characters — or `undefined` when no
 * such argument was passed.
 */
function getArgValue(args: readonly string[], prefix: string): string | undefined {
	const arg = args.find(candidate => candidate.startsWith(prefix));
	return arg === undefined ? undefined : arg.slice(prefix.length);
}

/**
 * Script entry point: reads `baseline.json`, optionally compares it against `baseline.old.json`
 * from the same directory, and prints the score table.
 *
 * Supported arguments:
 * - `--compare`: show old -> new scores using `baseline.old.json`
 * - `--upgrade-old-baseline`: first overwrite `baseline.old.json` with the baseline from git `HEAD`
 * - `--color`: use ANSI colors in the output
 * - `--omit-equal`: omit rows on which all providers score the same
 * - `--filter=<a,b,...>`: only show the given providers (case-insensitive)
 * - `--external-baseline=<path>`: read the baseline from `<path>` instead of the default location
 */
async function main() {
	const args = process.argv.slice(2);
	const compare = args.includes('--compare');
	const upgradeBaselineOldJson = args.includes('--upgrade-old-baseline');
	const useColoredOutput = args.includes('--color');
	const omitEqual = args.includes('--omit-equal');
	const filterProviders = getArgValue(args, '--filter=')?.split(',').map(s => s.toLocaleLowerCase());
	const externalBaselinePath = getArgValue(args, '--external-baseline=');

	// Determine baseline paths
	const BASELINE_JSON_PATH = externalBaselinePath ? path.resolve(externalBaselinePath) : DEFAULT_BASELINE_JSON_PATH;
	const BASELINE_OLD_JSON_PATH = path.join(path.dirname(BASELINE_JSON_PATH), 'baseline.old.json');

	let baselineJson: string;
	try {
		baselineJson = await fs.readFile(BASELINE_JSON_PATH, 'utf8');
	} catch (e: unknown) {
		console.error('Failed to read baseline.json');
		throw e;
	}
	let baseline: BaselineTestResult[];
	try {
		baseline = JSON.parse(baselineJson) as BaselineTestResult[];
	} catch (e: unknown) {
		console.error('Failed to parse baseline.json');
		throw e;
	}

	if (upgradeBaselineOldJson) {
		// git expects `/` separators, and resolves `<rev>:<path>` from the repository root unless
		// the path starts with `./` or `../` — in which case it is relative to the cwd, which is
		// what `path.relative(process.cwd(), ...)` produces.
		const relativeBaselinePath = path.relative(process.cwd(), BASELINE_JSON_PATH).split(path.sep).join('/');
		const gitBaselinePath = relativeBaselinePath.startsWith('../') ? relativeBaselinePath : `./${relativeBaselinePath}`;

		const baselineJsonContentsFromHEAD = await new Promise<string>((resolve, reject) => {
			execFile('git', ['show', `HEAD:${gitBaselinePath}`], { maxBuffer: GIT_SHOW_MAX_BUFFER }, (error: Error | null, stdout: string) => {
				if (error) {
					reject(error);
					return;
				}
				resolve(stdout);
			});
		});
		await fs.writeFile(BASELINE_OLD_JSON_PATH, baselineJsonContentsFromHEAD);
	}

	let oldBaseline: BaselineTestResult[] | undefined;
	if (compare) {
		let oldBaselineJson: string;
		try {
			oldBaselineJson = await fs.readFile(BASELINE_OLD_JSON_PATH, 'utf8');
		} catch (e: unknown) {
			console.error('Failed to read baseline.old.json');
			throw e;
		}
		try {
			oldBaseline = JSON.parse(oldBaselineJson) as BaselineTestResult[];
		} catch (e: unknown) {
			console.error('Failed to parse baseline.old.json');
			throw e;
		}
	}

	const testResults = computeTestResultsFromBaseline(baseline);
	const oldTestResults = compare && oldBaseline ? computeTestResultsFromBaseline(oldBaseline) : undefined;

	// test name -> flavor -> score in the old baseline (empty when not comparing)
	const testNameToOldScoresByFlavor = oldTestResults?.reduce((acc: Record<string /* testName */, Record<string /* flavor */, number | undefined>>, testResult) => {
		acc[testResult.name] = testResult.testResults.reduce((acc, testResult) => {
			acc[getFlavor(testResult)] = testResult.score;
			return acc;
		}, { 'Comp1': testResult.compScore1, 'Comp2': testResult.compScore2, 'Comp3': testResult.compScore3 } as Record<string, number | undefined>);
		return acc;
	}, {}) ?? {};

	const result = testResults.map(testResult => {
		const oldScoresByFlavor = testNameToOldScoresByFlavor[testResult.name] || {};
		const scores = testResult.testResults.reduce((acc: TestScoreByFlavor, testResult) => {
			const flavor = getFlavor(testResult);
			const newScore = testResult.score;
			const oldScore = oldScoresByFlavor[flavor];
			// Without an old score for this flavor there is nothing to compare against.
			acc[flavor] = oldScore === undefined ? newScore : { oldScore, newScore };
			return acc;
		}, { 'Comp1': testResult.compScore1, 'Comp2': testResult.compScore2, 'Comp3': testResult.compScore3 });
		return {
			test: testResult.name,
			signalKind: testResult.signalKind,
			scores,
		};
	});

	printTable(result, { compare, useColoredOutput, filterProviders, omitEqual });
}

// Report failures explicitly and exit with a non-zero code instead of leaving the promise floating.
main().catch((error: unknown) => {
	console.error(error);
	process.exitCode = 1;
});