Path: blob/main/extensions/copilot/test/simulationMain.ts
13383 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

// Load env
import * as dotenv from 'dotenv';
dotenv.config();

// Needed for better stack traces as captureLocation parses the stack trace to find stests
import 'source-map-support/register';

// Load other imports
import * as fs from 'fs';
import minimist from 'minimist';
import { createConnection } from 'net';
import * as path from 'path';
import * as v8 from 'v8';
import type * as vscodeType from 'vscode';
import { SimpleRPC } from '../src/extension/onboardDebug/node/copilotDebugWorker/rpc';
import { ISimulationModelConfig, createExtensionUnitTestingServices } from '../src/extension/test/node/services';
import { CHAT_MODEL } from '../src/platform/configuration/common/configurationService';
import { IEndpointProvider, ModelSupportedEndpoint } from '../src/platform/endpoint/common/endpointProvider';
import { IModelConfig } from '../src/platform/endpoint/test/node/openaiCompatibleEndpoint';
import { fileSystemServiceReadAsJSON } from '../src/platform/filesystem/common/fileSystemService';
import { LogLevel } from '../src/platform/log/common/logService';
import { ParserWithCaching } from '../src/platform/parser/node/parserWithCaching';
import { structureComputer } from '../src/platform/parser/node/structure';
import { NullTelemetryService } from '../src/platform/telemetry/common/nullTelemetryService';
import { TokenizerProvider } from '../src/platform/tokenizer/node/tokenizer';
import { assert } from '../src/util/vs/base/common/assert';
import { Cache } from './base/cache';
import { IChatMLCache } from './base/cachingChatMLFetcher';
import { usedResourceCaches } from './base/cachingResourceFetcher';
import { ChatMLSQLiteCache } from './base/chatMLCache';
import { CompletionsSQLiteCache, ICompletionsCache } from './base/completionsCache';
import { usedEmbeddingsCaches } from './base/embeddingsCache';
import { TestingCacheSalts } from './base/salts';
import { ICompleteBaselineComparison, IModifiedScenario, SimulationBaseline } from './base/simulationBaseline';
import { CacheMode, CurrentTestRunInfo, SimulationServicesOptions, createSimulationChatModelThrottlingTaskLaunchers, loadConfigFile } from './base/simulationContext';
import { ProxiedSimulationEndpointHealth, SimulationEndpointHealthImpl } from './base/simulationEndpointHealth';
import { BASELINE_RUN_COUNT, SimulationOptions } from './base/simulationOptions';
import { ProxiedSimulationOutcome, SimulationOutcomeImpl } from './base/simulationOutcome';
import { drainStdoutAndExit } from './base/stdout';
import { SimulationSuite, SimulationTest, SimulationTestsRegistry, createSimulationTestFilter } from './base/stest';
import { CollectingJSONOutputPrinter, ConsoleJSONOutputPrinter, IJSONOutputPrinter, ProxiedSONOutputPrinter } from './jsonOutputPrinter';
import { green, orange, red, violet, yellow } from './outputColorer';
import { runInputPipeline, runInputPipelineParallel } from './pipeline/pipeline';
import { ITestDiscoveryOptions, discoverTests } from './simulation/externalScenarios';
import { discoverCoffeTests } from './simulation/nesCoffeTests';
import { discoverNesTests } from './simulation/nesExternalTests';
import { OLD_BASELINE_FILENAME, OutputType, PRODUCED_BASELINE_FILENAME, REPORT_FILENAME, RUN_METADATA, SCORECARD_FILENAME, SIMULATION_FOLDER_NAME, generateOutputFolderName } from './simulation/shared/sharedTypes';
import { logger } from './simulationLogger';
import { IInitParams, IInitResult, IRunTestParams, IRunTestResult } from './testExecutionInExtension';
import { GroupedScores, ITestResult, SimulationTestContext, executeTestOnce, executeTests } from './testExecutor';
import { createScoreRenderer, fileExists, printTime } from './util';

/** Default parent folder for run output when `--output` is not given. */
const dotSimulationPath = path.join(__dirname, `../${SIMULATION_FOLDER_NAME}`);

/**
 * Process entry point.
 *
 * Runs either the extension-host worker (when `VSCODE_SIMULATION_EXTENSION_ENTRY` is set)
 * or the regular CLI, collects every error that surfaced (thrown, returned by {@link run},
 * or reported as an unhandled rejection), prints them, and exits with code 0 when there
 * were none and 1 otherwise.
 */
async function main() {
	const errors: unknown[] = [];

	process.env['SIMULATION'] = '1';

	// Unhandled rejections do not abort the run; they are recorded so the exit code reflects them.
	process.on('unhandledRejection', (reason, promise) => {
		console.error('\n\nUnhandled Rejection at: Promise', promise, 'reason:', reason);
		errors.push('unhandled rejection: ' + reason);
	});

	try {
		if (process.env.VSCODE_SIMULATION_EXTENSION_ENTRY) {
			await runInExtensionHost();
		} else {
			const opts = SimulationOptions.fromProcessArgs();
			const result = await run(opts);
			if (result) {
				errors.push(...result.errors);
			}
		}
	} catch (err) {
		errors.push(err?.stack || err?.message || String(err));
	}

	if (errors.length > 0) {
		console.error(`\n${red('⚠️⚠️⚠️ Command failed with:')}\n\n`);

		for (let i = 0; i < errors.length; i++) {
			// Only number the errors when there is more than one.
			const idx = `Error${errors.length === 1 ? '' : ` ${i + 1})`} `;
			console.error(`\t${idx}${errors[i]}\n\n`);
		}
	}

	await drainStdoutAndExit(errors.length === 0 ? 0 : 1);
}

/** Result of a CLI sub-command: nothing, or the list of errors that should fail the process. */
type RunResult = void | { errors: unknown[] };

/**
 * Dispatches to the sub-command selected by `opts` (help, model listing, NES data generation,
 * suite/test listing) and falls back to running the simulation tests.
 *
 * @param opts Parsed command line options.
 * @returns The errors produced by the test run, or nothing for the other sub-commands.
 */
async function run(opts: SimulationOptions): Promise<RunResult> {
	const jsonOutputPrinter: IJSONOutputPrinter = opts.jsonOutput ? new ConsoleJSONOutputPrinter() : new CollectingJSONOutputPrinter();

	if (opts.externalCacheLayersPath) {
		process.env['EXTERNAL_CACHE_LAYERS_PATH'] = opts.externalCacheLayersPath;
	}

	// `switch (true)` picks the first case whose condition holds.
	switch (true) {
		case opts.help && opts.subcommand === 'nes-datagen':
			return opts.printTrainHelp();
		case opts.help:
			return opts.printHelp();
		case opts.listModels:
			await listChatModels(opts.modelCacheMode === CacheMode.Disable);
			return;
		case !!opts.nesDatagen:
			if (opts.parallelism > 1 && !opts.nesDatagen.workerMode) {
				await runInputPipelineParallel(opts);
			} else {
				await runInputPipeline(opts);
			}
			return;
		case opts.listSuites: // intentional fallthrough
		case opts.listTests: {
			// stest runner extension runs with both `list-tests` and `list-suites` flags, so they should not be mutually exclusive
			const { allSuites } = await loadTests(opts);

			if (opts.listSuites) {
				listSuites(allSuites, opts, jsonOutputPrinter);
			}

			if (opts.listTests) {
				listTests(allSuites, opts, jsonOutputPrinter);
			}

			return;
		}
		default:
			return runTests(opts, jsonOutputPrinter);
	}
}

/**
 * Worker mode used when this file is loaded inside a VS Code extension host.
 *
 * Connects to the control process on `127.0.0.1:VSCODE_SIMULATION_CONTROL_PORT`, asks it for
 * the command line via the `init` RPC, and then serves `runTest` requests one test at a time.
 * The returned promise resolves once the control process calls the `close` RPC.
 */
async function runInExtensionHost() {
	const nodeOptions = process.env.NODE_OPTIONS;

	// Hook for the js-debug bootloader, which is not automatically executed in the extension host
	if (nodeOptions) {
		// NODE_OPTIONS is a CLI argument fragment that we need to parse here
		const regex = /"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\S+/g;
		const parsed = minimist(Array.from(nodeOptions.matchAll(regex), match => {
			let arg = match[0];
			// Remove surrounding quotes and unescape internal quotes if necessary
			if (arg[0] === arg.at(-1) && (arg[0] === '"' || arg[0] === '\'')) {
				arg = arg.slice(1, -1).replaceAll(`\\${arg[0]}`, arg[0]);
			}
			return arg;
		}));

		if (parsed.require) {
			const reqPaths = Array.isArray(parsed.require) ? parsed.require : [parsed.require];
			logger.info(`Loading NODE_OPTIONS require: ${reqPaths.join(', ')}`);
			reqPaths.forEach(r => require(r));
		}
	}

	const port = Number(process.env.VSCODE_SIMULATION_CONTROL_PORT);
	const rpc = await new Promise<SimpleRPC>((resolve, reject) => {
		const socket = createConnection({ host: '127.0.0.1', port });
		socket.on('connect', () => resolve(new SimpleRPC(socket)));
		socket.on('error', reject);
	});

	// `vscode` only exists inside the extension host, hence the runtime require.
	const vscode: typeof vscodeType = require('vscode');
	const folder = vscode.workspace.workspaceFolders![0];

	// Forward auth prompts to the control process.
	Cache.Instance.on('deviceCodeCallback', (url: string) => {
		rpc.callMethod('deviceCodeCallback', { url });
	});

	rpc.registerMethod('runTest', async (params: IRunTestParams): Promise<IRunTestResult> => {
		// `allTests` is declared further down; the handler only runs after it has been assigned.
		const { simulationTestContext, tests } = await allTests;

		// Reset the per-test state that is shared through the context.
		simulationTestContext.baseline.clear();
		simulationTestContext.simulationEndpointHealth.failures.splice(0, simulationTestContext.simulationEndpointHealth.failures.length);

		const test = tests.get(params.testName);
		if (!test) {
			throw new Error(`Test ${params.testName} not found`);
		}

		const result = await executeTestOnce(
			simulationTestContext,
			1,
			params.outcomeDirectory,
			test,
			params.runNumber,
			true,
		);

		return { result };
	});

	const allTests = rpc.callMethod('init', { folder: folder.uri.fsPath } satisfies IInitParams).then(async (res: IInitResult) => {
		const opts = SimulationOptions.fromArray(res.argv);
		const { testsToRun } = await loadTests(opts);
		const { simulationTestContext } = await prepareTestEnvironment(opts, new ProxiedSONOutputPrinter(rpc), rpc);
		return { opts, tests: new Map(testsToRun.map(t => [t.fullName, t])), simulationTestContext };
	});

	return new Promise<void>(resolve => {
		rpc.registerMethod('close', async () => {
			resolve();
		});
	});
}

/**
 * Loads the tests, prepares the output folder and baseline, and builds the shared
 * simulation test context.
 *
 * @param opts Parsed command line options.
 * @param jsonOutputPrinter Receives the initial test summary.
 * @param rpcInExtensionHost When set, this process is one participant of a larger run: the
 * output folder is not cleaned and outcome/health reporting is proxied through the RPC.
 * @returns The simulation context plus the values {@link runTests} needs for reporting.
 */
async function prepareTestEnvironment(opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter, rpcInExtensionHost?: SimpleRPC) {

	if (opts.verbose) {
		logger.setLogLevel(LogLevel.Trace);
	}

	// Configure caching
	if (opts.parallelism > 1) {
		// To get good cache behavior, we must increase the cache size considerably
		ParserWithCaching.CACHE_SIZE_PER_LANGUAGE = Math.max(5, 2 * opts.parallelism);
		structureComputer.setCacheSize(Math.max(5, 2 * opts.parallelism));
	}
	fileSystemServiceReadAsJSON.enable();

	const { allSuites, testsToRun, externalScenariosPath } = await loadTests(opts);

	let outputPath = opts.output;
	if (outputPath === undefined) {
		outputPath = path.join(dotSimulationPath, generateOutputFolderName());
	} else {
		// If it's not an absolute path, make it relative to the current working directory
		if (!path.isAbsolute(outputPath)) {
			outputPath = path.join(process.cwd(), outputPath);
		}
	}
	if (!rpcInExtensionHost) { // don't clean if we're just one participant in a larger run
		await clearOrCreateDir(outputPath);
	}

	jsonOutputPrinter.print({
		type: OutputType.initialTestSummary,
		runOutputFolderName: path.basename(outputPath),
		testsToRun: testsToRun.map(t => t.fullName),
		nRuns: opts.nRuns
	});

	const allTests = allSuites.flatMap(cur => cur.tests);
	const hasFilteredTests = testsToRun.length !== allTests.length;

	if (!opts.jsonOutput) {
		if (hasFilteredTests) {
			console.log(`Due to grep filters, will execute ${testsToRun.length} out of ${allTests.length} simulations. Each simulation runs ${opts.nRuns} time(s).\n`);
		} else {
			console.log(`Will execute ${testsToRun.length} simulations. Each simulation runs ${opts.nRuns} time(s).\n`);
		}
	}


	writeHeapSnapshot(opts.heapSnapshots, 'before');

	const canUseBaseline = (opts.nRuns === BASELINE_RUN_COUNT); // only use baseline if running N times
	const runningAllTests = (opts.grep === undefined && opts.omitGrep === undefined);

	// The comma expression asserts first, then yields the external baseline path.
	const baselinePath = opts.externalBaseline
		? (
			assert(opts.externalScenarios !== undefined, 'externalBaseline must be set only with externalScenarios'),
			path.join(opts.externalScenarios, 'baseline.json')
		)
		: SimulationBaseline.DEFAULT_BASELINE_PATH;

	const baseline = await SimulationBaseline.readFromDisk(baselinePath, runningAllTests);

	if (canUseBaseline) { // copy current baseline as the baseline before the run
		await fs.promises.copyFile(baseline.baselinePath, path.join(outputPath, OLD_BASELINE_FILENAME));
	}

	const configs = opts.configFile ? loadConfigFile(opts.configFile) : undefined;

	return {
		...createSimulationTestContext(opts, runningAllTests, baseline, canUseBaseline, jsonOutputPrinter, outputPath, externalScenariosPath, rpcInExtensionHost, configs),
		testsToRun,
		baseline,
		canUseBaseline,
		outputPath,
		runningAllTests,
		hasFilteredTests,
	};

}

/**
 * Runs the selected simulation tests and writes all run artifacts (score card, report,
 * produced baseline, optional run metadata) into the output folder.
 *
 * @param opts Parsed command line options.
 * @param jsonOutputPrinter Sink for machine readable progress output.
 * @returns The errors that should make the process exit with a failure code: critical test
 * failures, baseline mismatches in CI, an impossible baseline update, or endpoint failures
 * during a CI baseline update.
 * @throws If `--gc` is combined with `--external-cache-layers-path`.
 */
async function runTests(opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter): Promise<RunResult> {
	const errors: unknown[] = [];

	Cache.Instance.on('deviceCodeCallback', (url: string) => {
		if (opts.jsonOutput) {
			jsonOutputPrinter.print({ type: OutputType.deviceCodeCallback, url });
		} else {
			console.log(`⚠️ \x1b[31mAuth Required!\x1b[0m Please open the link: ${url}`);
		}
	});

	const { simulationEndpointHealth, simulationOutcome, simulationTestContext, testsToRun, baseline, canUseBaseline, outputPath, runningAllTests, hasFilteredTests } = await prepareTestEnvironment(opts, jsonOutputPrinter);

	if (opts.gc) {
		if (opts.externalCacheLayersPath) {
			throw new Error('--gc is currently not compatible with --external-cache-layers-path');
		}
		Cache.Instance.gcStart();
	}

	const totalStartTime = Date.now();
	const { testResultsPromises, getGroupedScores } = await executeTests(simulationTestContext, testsToRun);

	console.log('Waiting on test results...');

	const testResults = await Promise.all(testResultsPromises);

	writeHeapSnapshot(opts.heapSnapshots, 'after');

	const totalTime = Date.now() - totalStartTime;

	if (opts.gc) {
		Cache.Instance.gcEnd();
	}

	// Only outcomes flagged as critical fail the whole command.
	for (const result of testResults) {
		for (const [idx, o] of result.outcomes.entries()) {
			if (o?.kind === 'failed' && o.critical) {
				errors.push(`Test failed: ${result.test}, run ${idx}\n` + o.error);
			}
		}
	}

	// this allows to quickly identify which new cache entries were created in this particular simulation run
	if (opts.stageCacheEntries && !opts.externalScenarios) {
		// TODO@joaomoreno
		console.warn('!!! Determining new cache entries is not yet working in Redis, ask Joao to implement it');
	}

	const groupedScores = await getGroupedScores();
	printOutcome(groupedScores, testsToRun, baseline, opts, canUseBaseline, runningAllTests, testResults, totalTime);

	const tableData = buildScoreTable(groupedScores);
	const suiteScoreCard = path.join(outputPath, SCORECARD_FILENAME);
	await fs.promises.writeFile(suiteScoreCard, toCsv(tableData));

	if (simulationOutcome instanceof SimulationOutcomeImpl) {
		if (!opts.noCachePointer) {
			await simulationOutcome.write();
		}

		if (!opts.externalScenarios && !hasFilteredTests) {
			await simulationOutcome.cleanFolder();
		}
	}

	if (canUseBaseline) {
		await baseline.writeToDisk(path.join(outputPath, PRODUCED_BASELINE_FILENAME));
	}

	if (opts.isUpdateBaseline) {
		if (canUseBaseline) {
			await baseline.writeToDisk();
		} else {
			errors.push(`Cannot update baseline for ${opts.nRuns} run(s). Please use --n=${BASELINE_RUN_COUNT}.`);
		}
	}

	await jsonOutputPrinter.flush?.(outputPath);

	const filePath = path.join(outputPath, REPORT_FILENAME);
	await fs.promises.writeFile(filePath, JSON.stringify(testResults, null, '\t'));

	if (opts.label) {
		const runMetadata = path.join(outputPath, RUN_METADATA);
		await fs.promises.writeFile(runMetadata, JSON.stringify({ label: opts.label }, null, '\t'));
	}

	// Enable if you want to see which cache entries were used in this simulation run
	const writeUsedOtherCaches = false;
	if (writeUsedOtherCaches) {
		await fs.promises.writeFile('other-caches.json', JSON.stringify(
			([] as string[])
				.concat(Array.from(usedEmbeddingsCaches))
				.concat(Array.from(usedResourceCaches))
		));
	}

	if (opts.ci && !opts.isUpdateBaseline) {
		const changeStats = baseline.compare();
		const error = validateChangeStats(changeStats);
		if (error) {
			errors.push(red(`${error.errorMessage}. Please run 'npm run simulate-update-baseline' and check in baseline.json.`));
		}
	} else {
		if (simulationEndpointHealth.failures.length > 0) {
			const rateLimitedCount = simulationEndpointHealth.failures.filter(f => f.request.type === 'rateLimited').length;
			const failedCount = simulationEndpointHealth.failures.filter(f => f.request.type === 'failed').length;

			// If there were simulation endpoint failures and we are doing a
			// CI baseline update, fail the CI so that we block PR merge
			if (opts.ci && opts.isUpdateBaseline) {
				errors.push(
					red(`Encountered server failures while running simulation: ${rateLimitedCount} rate limited responses, ${failedCount} other failed responses. Please rerun the simulation!`),
					...simulationEndpointHealth.failures.map(f => `- ${f.testInfo.testName}: ${f.request.reason}`),
				);
			}
		}
	}

	return { errors };
}

/**
 * Determines the suites and tests for this invocation.
 *
 * With `--external-scenarios` the tests are discovered from that folder (or, for
 * `--scenario-test`, the folder is handed to the registry as input path). Whenever that
 * leaves no test to run, the built-in tests from `./simulationTests` are loaded through the
 * registry with the grep filters applied.
 *
 * @param opts Parsed command line options.
 * @returns All known suites, the filtered tests to run, and the absolute external scenarios
 * path (undefined when no external scenarios were requested).
 */
async function loadTests(opts: SimulationOptions) {
	let allSuites: readonly SimulationSuite[] = [];
	let testsToRun: readonly SimulationTest[] = [];

	let externalScenariosPath = opts.externalScenarios;
	if (externalScenariosPath) {
		let usageError = false;
		if (!opts.inline && !opts.sidebar && !opts.nes) {
			usageError = true;
			console.error(`Missing --inline or --sidebar or --nes flag`);
		}
		if ([opts.inline, opts.sidebar, opts.nes].filter(Boolean).length > 1) {
			usageError = true;
			console.error(`Can only have one of --inline or --sidebar or --nes flags set`);
		}

		if (typeof opts.output !== 'string') {
			usageError = true;
			console.error(`Missing --output flag`);
		}

		if (usageError) { // process.exit() if there's a usage error
			console.error(`Usage: npm run simulate -- --external-scenarios=<path> --inline --output=<path>`);
			console.error(`Usage: npm run simulate -- --external-scenarios=<path> --sidebar --output=<path>`);
			await drainStdoutAndExit(1);
		}

		// Update paths to be absolute
		// If it's not an absolute path, make it relative to the current working directory
		if (!path.isAbsolute(externalScenariosPath)) {
			externalScenariosPath = path.join(process.cwd(), externalScenariosPath);
		}

		if (opts.scenarioTest) {
			SimulationTestsRegistry.setInputPath(externalScenariosPath);
		} else {
			const filter = createSimulationTestFilter(opts.grep, opts.omitGrep);
			if (opts.nes) {
				if (opts.nes === 'external') {
					// run external stests
					allSuites = [await discoverNesTests(externalScenariosPath, opts)];
				} else {
					// run coffe stests
					allSuites = [await discoverCoffeTests(externalScenariosPath, opts)];
				}
			} else {
				const testDiscoveryOptions: ITestDiscoveryOptions = {
					chatKind: (opts.inline && !opts.sidebar) ? 'inline' : 'panel',
					applyChatCodeBlocks: opts.applyChatCodeBlocks,
				};
				allSuites = await discoverTests(externalScenariosPath, testDiscoveryOptions);
			}
			testsToRun = allSuites
				.flatMap(suite => suite.tests)
				.filter(filter)
				.sort((t0, t1) => t0.fullName.localeCompare(t1.fullName));
		}
	}

	// Nothing selected so far: fall back to the tests registered by `./simulationTests`.
	if (testsToRun.length === 0) {
		SimulationTestsRegistry.setFilters(opts.scenarioTest, opts.grep, opts.omitGrep);
		await import('./simulationTests');
		allSuites = SimulationTestsRegistry.getAllSuites();
		testsToRun = SimulationTestsRegistry.getAllTests();
	}
	return { allSuites, testsToRun, externalScenariosPath };
}

/** Emits one `detectedSuite` record per suite through `jsonOutputPrinter`. */
function listSuites(allSuites: readonly SimulationSuite[], opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter) {
	for (const suite of allSuites) {
		jsonOutputPrinter.print({ type: OutputType.detectedSuite, name: suite.fullName, location: suite.options.location });
	}
}

/**
 * Emits one `detectedTest` record per test of every suite and, unless JSON output is
 * requested, also prints the test name to the console.
 */
function listTests(allSuites: readonly SimulationSuite[], opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter) {
	// we should just list all tests
	const allTests = allSuites.flatMap(suite => suite.tests);
	for (const test of allTests) {
		jsonOutputPrinter.print({ type: OutputType.detectedTest, suiteName: test.suite.fullName, name: test.fullName, location: test.options.location });
		if (!opts.jsonOutput) {
			console.log(` - ${test.fullName}`);
		}
	}
}

/**
 * Prints a table of the available chat models grouped by model family.
 *
 * @param skipCache Passed on as `skipModelMetadataCache` to the testing services.
 */
async function listChatModels(skipCache: boolean = false) {
	const accessor = createExtensionUnitTestingServices(undefined, undefined, { skipModelMetadataCache: skipCache }).createTestingAccessor();
	const endpointProvider = accessor.get(IEndpointProvider);
	const chatEndpoints = await endpointProvider.getAllChatEndpoints();
	console.log('Available Chat Models:\n');

	// Group models by family
	const modelsByFamily = new Map<string, string[]>();

	for (const endpoint of chatEndpoints) {
		const family = endpoint.family || 'Other'; // Default family name if not specified
		if (!modelsByFamily.has(family)) {
			modelsByFamily.set(family, []);
		}
		modelsByFamily.get(family)!.push(endpoint.model);
	}

	// Print each family with its models
	const tableData: { Family: string; Models: string }[] = [];

	// Convert to array and sort by family name for consistent display
	const sortedFamilies = Array.from(modelsByFamily.entries()).sort((a, b) => a[0].localeCompare(b[0]));

	for (const [family, models] of sortedFamilies) {
		// Sort models within each family
		models.sort();
		tableData.push({
			Family: family,
			Models: models.join(', ')
		});
	}

	console.table(tableData);
	return;
}

/**
 * Assembles the {@link SimulationTestContext} shared by all tests of a run.
 *
 * @param opts Parsed command line options.
 * @param runningAllTests Whether no grep filter is active; forwarded to the outcome store.
 * @param baseline Baseline the run is compared against.
 * @param canUseBaseline Whether the run count matches the baseline run count.
 * @param jsonOutputPrinter Sink for machine readable output.
 * @param outputPath Absolute output folder of the run.
 * @param externalScenariosPath Absolute external scenarios folder, if any.
 * @param rpcInExtensionHost When set, endpoint health and outcome are proxied over this RPC.
 * @param configs Extra configuration values loaded from `--config-file`, if any.
 * @returns The context together with its endpoint health and outcome objects.
 */
function createSimulationTestContext(
	opts: SimulationOptions,
	runningAllTests: boolean,
	baseline: SimulationBaseline,
	canUseBaseline: boolean,
	jsonOutputPrinter: IJSONOutputPrinter,
	outputPath: string,
	externalScenariosPath: string | undefined,
	rpcInExtensionHost: SimpleRPC | undefined,
	configs: Record<string, unknown> | undefined,
) {
	const simulationEndpointHealth = rpcInExtensionHost ? new ProxiedSimulationEndpointHealth(rpcInExtensionHost) : new SimulationEndpointHealthImpl();

	let createChatMLCache: ((info: CurrentTestRunInfo) => IChatMLCache) | undefined;
	let createNesFetchCache: ((info: CurrentTestRunInfo) => ICompletionsCache) | undefined;

	if (opts.lmCacheMode === CacheMode.Disable) {
		console.warn('❗ Not using any cache');
		createChatMLCache = undefined;
		createNesFetchCache = undefined;
	} else {
		createChatMLCache = (info: CurrentTestRunInfo) => new ChatMLSQLiteCache(TestingCacheSalts.requestCacheSalt, info);
		createNesFetchCache = (info: CurrentTestRunInfo) => new CompletionsSQLiteCache(TestingCacheSalts.nesFetchCacheSalt, info);
	}

	const simulationServicesOptions: SimulationServicesOptions = {
		createChatMLCache,
		createNesFetchCache,
		chatModelThrottlingTaskLaunchers: createSimulationChatModelThrottlingTaskLaunchers(opts.boost),
		isNoFetchModeEnabled: opts.noFetch,
		languageModelCacheMode: opts.lmCacheMode,
		resourcesCacheMode: opts.resourcesCacheMode,
		disabledTools: opts.disabledTools,
		summarizeHistory: opts.summarizeHistory,
		swebenchPrompt: opts.swebenchPrompt,
		useExperimentalCodeSearchService: opts.useExperimentalCodeSearchService,
		configs
	};

	const customModelConfigMap: Map<string, IModelConfig> = new Map();
	if (opts.modelConfigFile) {
		console.log('Using model configuration file: ' + opts.modelConfigFile);
		const customModelConfigs = parseModelConfigFile(opts.modelConfigFile);
		customModelConfigs.forEach(config => {
			customModelConfigMap.set(config.id, config);
		});
	}

	const modelConfig: ISimulationModelConfig = {
		chatModel: opts.chatModel,
		fastChatModel: opts.fastChatModel,
		smartChatModel: opts.smartChatModel,
		embeddingType: opts.embeddingType,
		fastRewriteModel: opts.fastRewriteModel,
		skipModelMetadataCache: opts.modelCacheMode === CacheMode.Disable,
		customModelConfigs: customModelConfigMap,
	};


	const simulationOutcome = rpcInExtensionHost ? new ProxiedSimulationOutcome(rpcInExtensionHost) : new SimulationOutcomeImpl(runningAllTests);

	const simulationTestContext: SimulationTestContext = {
		opts,
		baseline,
		canUseBaseline,
		jsonOutputPrinter,
		outputPath,
		externalScenariosPath,
		modelConfig,
		simulationServicesOptions,
		simulationOutcome,
		simulationEndpointHealth,
		tokenizerProvider: new TokenizerProvider(false, new NullTelemetryService()) // this is expensive so we share it across all stests
	};
	return { simulationTestContext, simulationEndpointHealth, simulationOutcome };
}

/**
 * Prints the human readable summary of a run to the console: per-suite tables (only for runs
 * of at least 10 tests), overall scores, and the scenarios that changed against the baseline.
 *
 * @param groupedScores Scores grouped by suite, then language, then model.
 * @param testsToRun The tests that were executed.
 * @param baseline Baseline holding the scores of this run and of the previous one.
 * @param opts Parsed command line options.
 * @param canUseBaseline Whether exact baseline scores can be reported.
 * @param runningAllTests Whether no grep filter is active.
 * @param testResults Results of all executed tests.
 * @param totalTime Wall clock duration of the run in milliseconds.
 */
function printOutcome(
	groupedScores: GroupedScores,
	testsToRun: readonly SimulationTest[],
	baseline: SimulationBaseline,
	opts: SimulationOptions,
	canUseBaseline: boolean,
	runningAllTests: boolean,
	testResults: ITestResult[],
	totalTime: number
): void {
	const shouldShowSummaries = (testsToRun.length >= 10); // only when running at least 10 tests
	const shouldBeBrief = (testsToRun.length === 1); // when running a single test, be brief

	if (shouldShowSummaries) {
		const modelComparisonTable = [];
		for (const [suiteName, scoresPerSuite] of groupedScores.entries()) {
			// Aggregate the scores of a suite per model, across all languages.
			const testScores = new Map<string, { count: number; scoreSum: number }>();
			for (const [_language, scoresPerLanguage] of scoresPerSuite.entries()) {
				for (const [model, scoresPerModel] of scoresPerLanguage.entries()) {
					if (!model) {
						continue;
					}
					const data = testScores.get(model) || { count: 0, scoreSum: 0 };
					data.count += scoresPerModel.length;
					data.scoreSum += scoresPerModel.reduce((acc, curr) => acc + curr, 0);
					testScores.set(model, data);
				}
			}
			// A comparison row is only shown when the suite ran with both models.
			let modelCount = 0;
			modelCount += (testScores.has(CHAT_MODEL.GPT41) ? 1 : 0);
			modelCount += (testScores.has(CHAT_MODEL.GPT4OMINI) ? 1 : 0);
			if (modelCount > 1) {
				// NOTE(review): the variable and the 'GPT-4o' column label refer to GPT-4o while the
				// lookup key is CHAT_MODEL.GPT41 - confirm which of the two is intended.
				const gpt4o = testScores.get(CHAT_MODEL.GPT41) ?? { count: 0, scoreSum: 0 };
				const gpt4oMini = testScores.get(CHAT_MODEL.GPT4OMINI) ?? { count: 0, scoreSum: 0 };
				const row = {
					Suite: suiteName,
					'# of tests': (gpt4o.count === 0 || gpt4oMini.count === 0) ? gpt4o.count || gpt4oMini.count : `${gpt4o.count} <> ${gpt4oMini.count}`,
					'GPT-4o': gpt4o.count ? Number(gpt4o.scoreSum / gpt4o.count * 100).toFixed(2) : '-',
					'GPT-4o-mini': gpt4oMini.count ? Number(gpt4oMini.scoreSum / gpt4oMini.count * 100).toFixed(2) : '-',
				};

				modelComparisonTable.push(row);
			}
		}
		if (modelComparisonTable.length !== 0) {
			console.log(`\n${yellow('Suite Summary by Model:')}`);
			console.table(modelComparisonTable);
		}

		console.log(`\n${yellow('Suite Summary by Language:')}`);
		const tableData = buildScoreTable(groupedScores);
		console.table(tableData);
	}

	const changeStats = baseline.compare();
	const scoreToString = createScoreRenderer(opts, canUseBaseline);
	const printChanged = (changedScenarios: IModifiedScenario[]) => {
		for (const scenario of changedScenarios) {
			const prettyScore = `${scoreToString(scenario.prevScore)} -> ${scoreToString(scenario.currScore)}`;
			const color = scenario.currScore > scenario.prevScore ? green : red;
			console.log(` - [${color(prettyScore)}] ${scenario.name}`);
		}
	};
	if (canUseBaseline) {
		console.log(`\nSummary:`);
		if (!shouldBeBrief && !runningAllTests) {
			console.log(` Tests Score: ${baseline.currentScore.toFixed(2)}%`);
		}
		if (!shouldBeBrief) {
			console.log(`Overall Score: ${baseline.overallScore.toFixed(2)}%`);
		}
		if (changeStats.nImproved > 0) {
			console.log(`${green('▲')} - Score improved in ${changeStats.nImproved} scenarios`);
		}
		if (changeStats.nWorsened > 0) {
			console.log(`${red('▼')} - Score decreased in ${changeStats.nWorsened} scenarios`);
		}
	} else {
		if (!shouldBeBrief) {
			console.log(`\n${yellow(`Approximate Summary (due to using --n=${opts.nRuns} instead of --n=${BASELINE_RUN_COUNT}):`)}`);
			const score = testResults.reduce((prev, curr) => prev + curr.score, 0);
			console.log(`Overall Approximate Score: ${(score / testsToRun.length * 100).toFixed(2)} / 100`);
		}
		if (changeStats.nImproved > 0) {
			console.log(`${green('▲')} - Score clearly improved in ${changeStats.nImproved} scenarios`);
		}
		if (changeStats.nWorsened > 0) {
			console.log(`${red('▼')} - Score clearly decreased in ${changeStats.nWorsened} scenarios`);
		}
	}
	if (changeStats.nUnchanged > 0) {
		console.log(`= - Score unchanged in ${changeStats.nUnchanged} scenarios`);
	}
	if (changeStats.addedScenarios > 0) {
		console.log(`${violet('◆')} - New scenarios count - ${changeStats.addedScenarios}`);
	}
	if (changeStats.removedScenarios > 0) {
		console.log(`${orange('●')} - Missing ${changeStats.removedScenarios} scenarios.`);
	}
	if (changeStats.skippedScenarios > 0) {
		console.log(`${yellow('●')} - Skipped ${changeStats.skippedScenarios} scenarios.`);
	}

	if (changeStats.improvedScenarios.length > 0 || changeStats.worsenedScenarios.length > 0) {
		console.log();
	}
	if (changeStats.improvedScenarios.length > 0) {
		console.log(`${green('Improved')}:`);
		printChanged(changeStats.improvedScenarios);
	}
	if (changeStats.worsenedScenarios.length > 0) {
		console.log(`${red('Worsened')}:`);
		printChanged(changeStats.worsenedScenarios);
	}

	console.log(`\n Simulation finished(${printTime(totalTime)}) \n`);
}

/**
 * Flattens the grouped scores into one table row per (suite, language, model) combination.
 *
 * @param groupedScores Scores grouped by suite, then language, then model.
 * @returns Rows with the number of tests and the average score in percent, rounded to two
 * decimals. Missing language or model keys are rendered as `-`.
 */
function buildScoreTable(groupedScores: GroupedScores): object[] {
	const tableData: object[] = [];
	for (const [suiteName, scoresPerSuite] of groupedScores.entries()) {
		for (const [language, scoresPerLanguage] of scoresPerSuite.entries()) {
			for (const [model, scoresPerModel] of scoresPerLanguage.entries()) {
				const row = {
					Suite: suiteName,
					Language: language ?? '-',
					Model: model ?? '-',
					'# of tests': scoresPerModel.length,
					'Score(%)': Number((scoresPerModel.reduce((acc, curr) => acc + curr, 0) / scoresPerModel.length * 100).toFixed(2)),
				};
				tableData.push(row);
			}
		}
	}
	return tableData;
}

/**
 * Decides whether a baseline comparison is acceptable for CI.
 *
 * @param changeStats Comparison of the current run against the checked-in baseline.
 * @returns A message for the first violated rule (worsened, improved, added, removed, or
 * skipped mandatory scenarios), or `undefined` when the run matches the baseline.
 */
function validateChangeStats(changeStats: ICompleteBaselineComparison): { errorMessage: string } | undefined {
	if (changeStats.nWorsened > 0) {
		// if any worsened, fail
		return { errorMessage: 'Some scenarios have worsened' };
	}
	if (changeStats.nImproved > 0) {
		// if any improved, fail
		return { errorMessage: 'Some scenarios have improved' };
	}
	if (changeStats.addedScenarios > 0) {
		// if any added, fail
		return { errorMessage: 'New scenarios detected' };
	}
	if (changeStats.removedScenarios > 0) {
		// if any removed, fail
		return { errorMessage: 'Some scenarios were removed' };
	}
	if (changeStats.mandatory.skippedScenarios > 0) {
		// only fail if mandatory scenarios are skipped
		return { errorMessage: 'Some mandatory scenarios were skipped' };
	}
	return undefined;
}

/**
 * Writes a V8 heap snapshot when heap snapshots were requested.
 *
 * @param snapshotFilename `undefined`/`false` disables the snapshot; a string is used as the
 * file name prefix (`<prefix>-<label>.heapsnapshot`); `true` passes no file name to
 * `v8.writeHeapSnapshot`.
 * @param label Marks whether the snapshot is taken before or after the test run.
 */
function writeHeapSnapshot(snapshotFilename: boolean | string | undefined, label: 'before' | 'after') {
	if (snapshotFilename === undefined || snapshotFilename === false) {
		return;
	}

	const fileName = typeof snapshotFilename === 'string' ? `${snapshotFilename}-${label}.heapsnapshot` : undefined;
	console.log(`Writing heap snapshot: ${v8.writeHeapSnapshot(fileName)}`);
}

/** Ensures `path` is an empty directory: removes it recursively if it exists, then recreates it. */
async function clearOrCreateDir(path: string) {
	if (await fileExists(path)) {
		await fs.promises.rm(path, { recursive: true, force: true });
	}
	await fs.promises.mkdir(path, { recursive: true });
}

/**
 * Serializes rows to CSV, using the keys of the first row as the header line.
 *
 * Cells containing a comma, a double quote or a line break are wrapped in double quotes with
 * embedded quotes doubled, so that such values (e.g. a suite name with a comma) do not shift
 * the columns. All other cells are written unchanged; null and undefined become empty cells.
 *
 * @param rows Rows to serialize; all rows are expected to share the keys of the first one.
 * @returns The CSV text with one `\n`-terminated line per row, or `''` for no rows.
 */
function toCsv(rows: object[]): string {
	if (rows.length === 0) { return ''; }

	const toCell = (value: unknown): string => {
		// Mirrors Array.prototype.join, which renders null/undefined as an empty string.
		const text = value === null || value === undefined ? '' : String(value);
		return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
	};

	const header = Object.keys(rows[0]).map(toCell).join(',') + '\n';
	const rowsStr = rows.map(obj => Object.values(obj).map(toCell).join(',') + '\n').join('');

	return header + rowsStr;
}

/**
 * Reads and validates a custom model configuration file.
 *
 * @param modelConfigFilePath Path of the JSON file; relative paths are resolved against the
 * current working directory.
 * @throws If the file is not valid JSON, is not an object, or a model entry has a missing or
 * wrongly typed property.
 */
function parseModelConfigFile(modelConfigFilePath: string): IModelConfig[] {
	const resolvedModelConfigFilePath = path.isAbsolute(modelConfigFilePath) ? modelConfigFilePath : path.join(process.cwd(), modelConfigFilePath);
	const configFileContents = fs.readFileSync(resolvedModelConfigFilePath, 'utf-8');

	let modelConfig: any;
	try {
		modelConfig = JSON.parse(configFileContents);
	} catch (error) {
		throw new Error(`Invalid JSON configuration file ${resolvedModelConfigFilePath}: ${error.message}`);
	}

	if (!modelConfig || typeof modelConfig !== 'object') {
		throw new Error('Invalid configuration file ' + resolvedModelConfigFilePath);
	}

	/**
	 * the modelConfigFile.json should contain objects of the form:
		```
		"<model id>": {
			"name": "<model name>",
			"version": "<model version>",
			"type": "<model type>", // 'openai' or 'azureOpenai'
			"useDeveloperRole": <boolean>, // optional, defaults to false
			"url": "<endpoint URL>",
			"capabilities"?: {
				"supports"?: {
					"parallel_tool_calls"?: <boolean>,
					"streaming"?: <boolean>,
					"tool_calls"?: <boolean>,
					"vision"?: <boolean>,
					"prediction"?: <boolean>,
					"thinking"?: <boolean>
				},
				"limits"?: {
					"max_prompt_tokens"?: <number>,
					"max_output_tokens"?: <number>,
					"max_context_window_tokens"?: <number>
				}
			},
			"auth"?: {
				"useBearerHeader"?: <boolean>, // Use Bearer token for authentication. Defaults to false
				"useApiKeyHeader"?: <boolean>, // Use API key for authentication. Defaults to false
				"apiKeyEnvName": "<environment variable name for API key to be used for the above headers>"
			},
			"overrides"?: {
				"requestHeaders"?: { "<header name>": "<header value>" }, // optional, custom request headers
				"temperature"?: <number> | null, // optional, if null removes from request body
				"top_p"?: <number> | null, // optional, if null removes from request body
				"snippy"?: <boolean> | null, // optional, if null removes from request body
				"max_tokens"?: <number> | null, // optional, if null removes from request body
				"max_completion_tokens"?: <number> | null, // optional, if null removes from request body
				"intent"?: <boolean> | null // optional, if null removes from request body
			}
		},
		...
		```
	*/

	const checkProperty = (obj: any, prop: string, type: 'string' | 'boolean' | 'number' | 'object', optional?: boolean, nullable?: boolean) => {
		// The container itself may be absent (e.g. `model.capabilities` when the optional
		// `capabilities` section is omitted). The `in` operator throws a TypeError on
		// non-objects, so a missing container is treated like a missing property.
		if (obj === null || typeof obj !== 'object' || !(prop in obj)) {
			if (optional) {
				return;
			}
			throw new Error(`Missing property '${prop}' in model configuration file ${resolvedModelConfigFilePath}`);
		}

		if (nullable && obj[prop] === null) {
			return;
		}

		if (typeof obj[prop] !== type) {
			throw new Error(`Property '${prop}' in model configuration file ${resolvedModelConfigFilePath} must be of type '${type}', but got '${typeof obj[prop]}'`);
		}
	};

	const modelConfigs: IModelConfig[] = [];
	for (const modelId in modelConfig) {
		const model = modelConfig[modelId];
		if (typeof model !== 'object') {
			throw new Error(`Model configuration for '${modelId}' must be an object`);
		}
		checkProperty(model, 'name', 'string');
		checkProperty(model, 'version', 'string');
		checkProperty(model, 'type', 'string');
		if (model.type !== 'openai' && model.type !== 'azureOpenai') {
			throw new Error(`Model type '${model.type}' is not supported. Only 'openai' and 'azureOpenai' are allowed.`);
		}
		checkProperty(model, 'useDeveloperRole', 'boolean', true);
		checkProperty(model, 'url', 'string');

		checkProperty(model, 'capabilities', 'object', true);
		checkProperty(model.capabilities, 'supports', 'object', true);
		if (model.capabilities?.supports) {
			checkProperty(model.capabilities.supports, 'parallel_tool_calls', 'boolean', true);
			checkProperty(model.capabilities.supports, 'streaming', 'boolean', true);
			checkProperty(model.capabilities.supports, 'tool_calls', 'boolean', true);
			checkProperty(model.capabilities.supports, 'vision', 'boolean', true);
			checkProperty(model.capabilities.supports, 'prediction', 'boolean', true);
			checkProperty(model.capabilities.supports, 'thinking', 'boolean', true);
		}

		checkProperty(model.capabilities, 'limits', 'object', true);
		if (model.capabilities?.limits) {
			checkProperty(model.capabilities.limits, 'max_prompt_tokens', 'number', true);
			checkProperty(model.capabilities.limits, 'max_output_tokens', 'number', true);
			checkProperty(model.capabilities.limits, 'max_context_window_tokens', 'number', true);
		}

		checkProperty(model, 'auth', 'object', true);
		if (model.auth) {
			checkProperty(model.auth, 'useBearerHeader', 'boolean', true);
			checkProperty(model.auth, 'useApiKeyHeader', 'boolean', true);
			checkProperty(model.auth, 'apiKeyEnvName', 'string');
		}

		checkProperty(model, 'overrides', 'object', true);
		if (model.overrides) {
			const overrides = model.overrides;
			checkProperty(overrides, 'requestHeaders', 'object', true, true);
			checkProperty(overrides, 'temperature', 'number', true, true);
			checkProperty(overrides, 'top_p', 'number', true, true);
			checkProperty(overrides, 'snippy', 'boolean', true, true);
			checkProperty(overrides, 'intent', 'boolean', true, true);
			checkProperty(overrides, 'max_tokens', 'number', true, true);
			checkProperty(overrides, 'max_completion_tokens', 'number', true, true);
		}

		// Validate
supported_endpoints924if (model.supported_endpoints) {925if (!Array.isArray(model.supported_endpoints)) {926throw new Error(`Property 'supported_endpoints' in model configuration file ${resolvedModelConfigFilePath} must be an array`);927}928for (const endpointSuffix of model.supported_endpoints) {929if (!Object.values(ModelSupportedEndpoint).includes(endpointSuffix as ModelSupportedEndpoint)) {930throw new Error(`Invalid endpoint suffix '${endpointSuffix}' in supported_endpoints for model '${modelId}'. Must be one of: ${Object.values(ModelSupportedEndpoint).join(', ')}`);931}932}933}934935modelConfigs.push({936id: modelId,937name: model.name,938version: model.version,939type: model.type,940useDeveloperRole: model.useDeveloperRole ?? false,941url: model.url,942capabilities: {943supports: {944parallel_tool_calls: model.capabilities?.supports?.parallel_tool_calls ?? false,945streaming: model.capabilities?.supports?.streaming ?? false,946tool_calls: model.capabilities?.supports?.tool_calls ?? false,947vision: model.capabilities?.supports?.vision ?? false,948prediction: model.capabilities?.supports?.prediction ?? false,949thinking: model.capabilities?.supports?.thinking ?? false950},951limits: {952max_prompt_tokens: model.capabilities?.limits?.max_prompt_tokens ?? 128000,953max_output_tokens: model.capabilities?.limits?.max_output_tokens ?? Number.MAX_SAFE_INTEGER,954max_context_window_tokens: model.capabilities?.limits?.max_context_window_tokens955}956},957supported_endpoints: model.supported_endpoints?.length ? model.supported_endpoints as ModelSupportedEndpoint[] : [ModelSupportedEndpoint.ChatCompletions],958auth: {959useBearerHeader: model.auth?.useBearerHeader ?? false,960useApiKeyHeader: model.auth?.useApiKeyHeader ?? false,961apiKeyEnvName: model.auth?.apiKeyEnvName962},963overrides: {964requestHeaders: model.overrides?.hasOwnProperty('requestHeaders') ? model.overrides.requestHeaders : {},965temperature: model.overrides?.hasOwnProperty('temperature') ? 
model.overrides.temperature : undefined,966top_p: model.overrides?.hasOwnProperty('top_p') ? model.overrides.top_p : undefined,967snippy: model.overrides?.hasOwnProperty('snippy') ? model.overrides.snippy : undefined,968intent: model.overrides?.hasOwnProperty('intent') ? model.overrides.intent : undefined,969max_tokens: model.overrides?.hasOwnProperty('max_tokens') ? model.overrides.max_tokens : undefined,970max_completion_tokens: model.overrides?.hasOwnProperty('max_completion_tokens') ? model.overrides.max_completion_tokens : undefined,971}972});973}974975return modelConfigs;976}977978(async () => main())();979980981