Path: blob/main/extensions/copilot/test/testExecutor.ts
13383 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
import * as path from 'path';
import { IPromptWorkspaceLabels, PromptWorkspaceLabels } from '../src/extension/context/node/resolvers/promptWorkspaceLabels';
import { INewWorkspacePreviewContentManager, NewWorkspacePreviewContentManagerImpl } from '../src/extension/intents/node/newIntent';
import { IntentError } from '../src/extension/prompt/node/intents';
import { ISimulationModelConfig } from '../src/extension/test/node/services';
import { IToolsService } from '../src/extension/tools/common/toolsService';
import { TestToolsService } from '../src/extension/tools/node/test/testToolsService';
import { IEndpointProvider } from '../src/platform/endpoint/common/endpointProvider';
import { TestEndpointProvider } from '../src/platform/endpoint/test/node/testEndpointProvider';
import { ConsoleLog, ILogService, LogServiceImpl } from '../src/platform/log/common/logService';
import { APIUsage } from '../src/platform/networking/common/openai';
import { ISimulationTestContext } from '../src/platform/simulationTestContext/common/simulationTestContext';
import { ITasksService } from '../src/platform/tasks/common/tasksService';
import { TestTasksService } from '../src/platform/tasks/common/testTasksService';
import { TestingServiceCollection } from '../src/platform/test/node/services';
import { ITokenizerProvider } from '../src/platform/tokenizer/node/tokenizer';
import { count } from '../src/util/common/arrays';
import { WellKnownLanguageId } from '../src/util/common/languages';
import { groupBy } from '../src/util/vs/base/common/collections';
import { BugIndicatingError } from '../src/util/vs/base/common/errors';
import { Lazy } from '../src/util/vs/base/common/lazy';
import { safeStringify } from '../src/util/vs/base/common/objects';
import { SyncDescriptor } from '../src/util/vs/platform/instantiation/common/descriptors';
import { SimulationExtHostToolsService } from './base/extHostContext/simulationExtHostToolsService';
import { SimulationBaseline, TestBaselineComparison } from './base/simulationBaseline';
import { CacheMode, createSimulationAccessor, CurrentTestRunInfo, SimulationServicesOptions } from './base/simulationContext';
import { ISimulationEndpointHealth } from './base/simulationEndpointHealth';
import { SimulationOptions } from './base/simulationOptions';
import { ISimulationOutcome } from './base/simulationOutcome';
import { FetchRequestCollector } from './base/spyingChatMLFetcher';
import { ISimulationTestRuntime, SimulationTest, SimulationTestRuntime, toDirname } from './base/stest';
import { IJSONOutputPrinter } from './jsonOutputPrinter';
import { green, red, violet, yellow } from './outputColorer';
import { ExternalSimulationTestRuntime } from './simulation/externalScenarios';
import * as shared from './simulation/shared/sharedTypes';
import { ITestSnapshots, TestSnapshotsImpl } from './simulation/testSnapshot';
import { TaskRunner } from './taskRunner';
import { TestExecutionInExtension } from './testExecutionInExtension';
import { createScoreRenderer, printTime } from './util';

/**
 * Represents outcome of N runs of a scenario.
 */
export interface ITestResult {
	test: string;
	outcomeDirectory: string;
	conversationPath?: string;
	score: number;
	usage: APIUsage;
	// FIXME@ulugbekna: specify when the outcome is undefined
	outcomes: (shared.SimulationTestOutcome | undefined)[];
	duration: number;
	cacheInfo: TestRunCacheInfo[];
	originalResults: ITestRunResult[];
}

/** Fields shared by passing and failing single-run results. */
interface ITestRunResultCommon {
	contentFilterCount: number;
	usage: APIUsage;
	cacheInfo: TestRunCacheInfo;
	hasCacheMiss: boolean;
}

/** Result of a single run whose `test.run(...)` completed without throwing. */
interface ITestRunResultPass extends ITestRunResultCommon {
	kind: 'pass';
	explicitScore: number | undefined;
	duration: number;
	outcome: shared.SimulationTestOutcome | undefined;
}

/** Result of a single run that threw (or whose service setup threw). */
interface ITestRunResultFail extends ITestRunResultCommon {
	kind: 'fail';
	message: string;
	duration: number;
	outcome: shared.SimulationTestOutcome;
}

/**
 * Represents outcome of a single run of a scenario.
 */
export type ITestRunResult = ITestRunResultPass | ITestRunResultFail;

export type CacheInfo = { type: 'request'; key: string }; // TODO: add other caches here

export type TestRunCacheInfo = CacheInfo[];

/** Everything the executor needs to run simulation tests and report their results. */
export interface SimulationTestContext {
	opts: SimulationOptions;
	baseline: SimulationBaseline;
	canUseBaseline: boolean;
	jsonOutputPrinter: IJSONOutputPrinter;
	outputPath: string;
	externalScenariosPath?: string;
	modelConfig: ISimulationModelConfig;
	simulationEndpointHealth: ISimulationEndpointHealth;
	simulationServicesOptions: SimulationServicesOptions;
	simulationOutcome: ISimulationOutcome;
	tokenizerProvider: ITokenizerProvider;
}

/** Scores keyed by suite full name, then test language, then test model. */
export type GroupedScores = Map<string, Map<WellKnownLanguageId | undefined, Map<string | undefined, number[]>>>;

/**
 * Merges every score of `from` into `into` (mutating `into`).
 *
 * Suites and languages that `into` does not have yet are adopted by reference
 * from `from`; score lists that exist on both sides are concatenated into a new array.
 */
function mergeGroupedScopes(into: GroupedScores, from: GroupedScores) {
	for (const [key, value] of from) {
		const intoValue = into.get(key);
		if (!intoValue) {
			into.set(key, value);
			continue;
		}

		for (const [language, scores] of value) {
			const intoScores = intoValue.get(language);
			if (!intoScores) {
				intoValue.set(language, scores);
				continue;
			}

			for (const [model, score] of scores) {
				const existing = intoScores.get(model);
				intoScores.set(model, existing ? [...existing, ...score] : score);
			}
		}
	}
}

export type ExecuteTestResult = {
	/** One promise per scheduled (non-skipped) test. */
	testResultsPromises: Promise<ITestResult>[];
	/** Resolves with the grouped scores once all scheduled tests have finished. */
	getGroupedScores(): Promise<GroupedScores>;
};

/**
 * Schedules all `testsToRun`, splitting them between the extension-host runner and the
 * local runner based on `test.suite.extHost ?? ctx.opts.inExtensionHost`.
 *
 * The extension-host runner is created lazily, on the first ext-host test execution.
 * The returned `getGroupedScores` waits for both groups, disposes the extension runner
 * (when `rawValue` is set - presumably only once the lazy value was requested; confirm
 * against `Lazy`) and returns the local scores with the ext-host scores merged in.
 */
export async function executeTests(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[]): Promise<ExecuteTestResult> {
	const location = groupBy(testsToRun as SimulationTest[], test => (test.suite.extHost ?? ctx.opts.inExtensionHost) ? 'extHost' : 'local');

	const extensionRunner = new Lazy(() => TestExecutionInExtension.create(ctx));
	const [extHost, local] = await Promise.all([
		executeTestsUsing(ctx, location['extHost'] ?? [], (...args) => extensionRunner.value.then(e => e.executeTest(...args))),
		executeTestsUsing(ctx, location['local'] ?? [], executeTestOnce),
	]);

	return {
		testResultsPromises: [...extHost.testResultsPromises, ...local.testResultsPromises],
		getGroupedScores: async () => {
			const [fromExtHost, fromLocal] = await Promise.all([extHost.getGroupedScores(), local.getGroupedScores()]);
			await extensionRunner.rawValue?.then(r => r.dispose());
			mergeGroupedScopes(fromLocal, fromExtHost);
			return fromLocal;
		},
	};
}

/**
 * Schedules `testsToRun` on a fresh `TaskRunner` limited to `opts.parallelism`, executing
 * each run through `executeTestFn`.
 *
 * Optional tests are skipped (and reported as skipped to the baseline and the JSON output)
 * when their `skip` predicate returns true or when running in CI. With a parallelism of 1
 * each test is awaited before the next one is scheduled.
 */
async function executeTestsUsing(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[], executeTestFn: ExecuteTestOnceFn): Promise<ExecuteTestResult> {
	const { opts, jsonOutputPrinter } = ctx;
	const groupedScores: GroupedScores = new Map();

	const taskRunner = new TaskRunner(opts.parallelism);

	const testResultsPromises: Promise<ITestResult>[] = [];
	for (const test of testsToRun) {

		if (test.options.optional && (test.options.skip(ctx.opts) || opts.ci)) { // CI never runs optional stests
			// Avoid spamming the console, we now have very many skipped stests
			// console.log(` Skipping ${test.fullName}`);
			ctx.baseline.setSkippedTest(test.fullName);
			jsonOutputPrinter.print({ type: shared.OutputType.skippedTest, name: test.fullName });
			continue;
		}

		const testRun = executeTestNTimes(ctx, taskRunner, test, groupedScores, executeTestFn);

		testResultsPromises.push(testRun);

		if (opts.parallelism === 1) {
			await testRun;
		}
	}

	return {
		testResultsPromises,
		getGroupedScores: async () => {
			await Promise.all(testResultsPromises);
			return groupedScores;
		},
	};
}

/**
 * Runs a single scenario `nRuns` times.
 *
 * After all runs have settled this records the outcome (unless `opts.externalScenarios` is set),
 * updates the baseline, prints the summary to the CLI, adds the score to `groupedScores`
 * and returns the aggregated result (average score, summed duration and token usage).
 */
async function executeTestNTimes(
	ctx: SimulationTestContext,
	taskRunner: TaskRunner,
	test: SimulationTest,
	groupedScores: GroupedScores,
	executeTestFn: ExecuteTestOnceFn
): Promise<ITestResult> {

	const { opts } = ctx;

	const outcomeDirectory = path.join(ctx.outputPath, toDirname(test.fullName));

	const testStartTime = Date.now();

	const scheduledTestRuns: Promise<ITestRunResult>[] = [];
	for (let kthRun = 0; kthRun < opts.nRuns; kthRun++) {
		scheduledTestRuns.push(taskRunner.run(() => executeTestFn(ctx, taskRunner.parallelism, outcomeDirectory, test, kthRun)));
	}

	const runResults: ITestRunResult[] = await Promise.all(scheduledTestRuns);

	// Wall-clock time for all runs, including time spent waiting in the task runner.
	const testElapsedTime = Date.now() - testStartTime;

	const testSummary = {
		results: runResults,
		hasCacheMisses: runResults.some(x => x.hasCacheMiss),
		// Number of runs that hit the content filter at least once (not the total number of hits).
		contentFilterCount: runResults.filter(x => x.contentFilterCount > 0).length,
	};

	if (!opts.externalScenarios) {
		await ctx.simulationOutcome.set(test, testSummary.results);
	}

	// A passing run counts as its explicit score (1 when none was set); a failing run counts as 0.
	const testResultToScore = (result: ITestRunResult) => result.kind === 'pass' ? (result.explicitScore ?? 1) : 0;

	// Round the sum to 3 decimals before averaging.
	const scoreTotal = Math.round(testSummary.results.reduce((total, result) => total + testResultToScore(result), 0) * 1000) / 1000;

	const currentScore = scoreTotal / testSummary.results.length;

	const currentPassCount = count(testSummary.results, s => s.kind === 'pass');

	const baselineComparison = ctx.baseline.setCurrentResult({
		name: test.fullName,
		optional: test.options.optional ? true : undefined,
		contentFilterCount: testSummary.contentFilterCount,
		passCount: currentPassCount,
		failCount: testSummary.results.length - currentPassCount,
		score: currentScore,
		attributes: test.attributes
	});

	printTestRunResultsToCli({ testSummary, ctx, test, currentScore, testElapsedTime, baselineComparison, });

	if (opts.verbose !== undefined) {
		printVerbose(opts, testSummary);
	}

	updateGroupedScores({ test, currentScore, groupedScores });

	// Sum of the individual run durations (unlike `testElapsedTime`, which is wall-clock).
	const duration = testSummary.results.reduce((acc, c) => acc + c.duration, 0);

	const initial: APIUsage = { completion_tokens: 0, prompt_tokens: 0, total_tokens: 0, prompt_tokens_details: { cached_tokens: 0 } };
	const usage: APIUsage = testSummary.results.reduce((acc, c): APIUsage => {
		if (c.usage === undefined) { return acc; }
		const { completion_tokens, prompt_tokens, total_tokens, prompt_tokens_details } = c.usage;
		return {
			completion_tokens: acc.completion_tokens + completion_tokens,
			prompt_tokens: acc.prompt_tokens + prompt_tokens,
			total_tokens: acc.total_tokens + total_tokens,
			prompt_tokens_details: {
				cached_tokens: (acc.prompt_tokens_details?.cached_tokens ?? 0) + (prompt_tokens_details?.cached_tokens ?? 0),
			}
		} satisfies APIUsage;
	}, initial);

	return {
		test: test.fullName,
		outcomeDirectory: path.relative(ctx.outputPath, outcomeDirectory),
		conversationPath: test.options.conversationPath,
		score: currentScore,
		duration,
		usage,
		outcomes: testSummary.results.map(r => r.outcome),
		cacheInfo: testSummary.results.map(r => r.cacheInfo),
		originalResults: testSummary.results,
	};
}

/**
 * Prints the one-line summary of a scenario to the console: an icon and color reflecting the
 * baseline comparison, the score (or `prev -> curr` when it changed), the test name and optional
 * suffixes for elapsed time (only with a parallelism of 1), cache misses and content filtering.
 */
function printTestRunResultsToCli({ testSummary, ctx, test, currentScore, testElapsedTime, baselineComparison }: {
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
		hasCacheMisses: boolean;
	};
	ctx: SimulationTestContext;
	test: SimulationTest;
	currentScore: number;
	testElapsedTime: number;
	baselineComparison: TestBaselineComparison;
}) {

	const scoreToString = createScoreRenderer(ctx.opts, ctx.canUseBaseline);
	const didScoreChange = !baselineComparison.isNew && baselineComparison.prevScore !== baselineComparison.currScore;
	const prettyScoreValue = didScoreChange
		? `${scoreToString(baselineComparison.prevScore)} -> ${scoreToString(baselineComparison.currScore)}`
		: `${scoreToString(currentScore)}`;

	let icon = '=';
	let color = (x: string | number) => x;
	if (baselineComparison.isNew) {
		icon = '◆';
		color = violet;
	} else if (baselineComparison.isImproved) {
		icon = '▲';
		color = green;
	} else if (baselineComparison.isWorsened) {
		icon = '▼';
		color = red;
	}

	const prettyTestTime = ctx.opts.parallelism === 1 ? ` (${(testElapsedTime > 10 ? yellow(printTime(testElapsedTime)) : printTime(testElapsedTime))})` : '';

	const prettyContentFilter = (testSummary.contentFilterCount ? yellow(` (⚠️ content filter affected ${testSummary.contentFilterCount} runs)`) : '');

	const hadCacheMisses = testSummary.hasCacheMisses ? yellow(' (️️️💸 cache miss)') : '';

	console.log(` ${color(icon)} [${color(prettyScoreValue)}] ${color(test.fullName)}${prettyTestTime}${hadCacheMisses}${prettyContentFilter}`);
}

/**
 * Prints one line per run (`<index> - pass|fail`). For failed runs the failure message is
 * additionally printed, indented, to stderr unless `opts.verbose` is `0`.
 */
function printVerbose(
	opts: SimulationOptions,
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
	}
) {
	for (let i = 0; i < testSummary.results.length; i++) {
		const result = testSummary.results[i];

		console.log(` ${i + 1} - ${result.kind === 'pass' ? green(result.kind) : red(result.kind)}`);
		if (result.kind === 'fail' && result.message && opts.verbose !== 0) {
			// indent the message and print
			console.error(result.message.split(/\r\n|\r|\n/g).map(line => ` ${line}`).join('\n'));
		}
	}
}

/**
 * Appends `currentScore` to the score list stored under
 * suite full name -> test language -> test model, creating missing levels on the way.
 * The score list is replaced by a new array rather than mutated in place.
 */
function updateGroupedScores({ test, currentScore, groupedScores }: {
	test: SimulationTest;
	currentScore: number;
	groupedScores: Map<string, Map<string | undefined, Map<string | undefined, number[]>>>;
}) {
	const suiteName = test.suite.fullName;
	const model = test.model;

	let scoresPerSuite = groupedScores.get(suiteName);
	if (!scoresPerSuite) {
		scoresPerSuite = new Map();
		groupedScores.set(suiteName, scoresPerSuite);
	}

	let scoresPerLanguage = scoresPerSuite.get(test.language);
	if (!scoresPerLanguage) {
		scoresPerLanguage = new Map();
		scoresPerSuite.set(test.language, scoresPerLanguage);
	}

	scoresPerLanguage.set(model, [...(scoresPerLanguage.get(model) ?? []), currentScore]);
}

/** Signature of a function that executes one run (`runNumber`) of `test`. */
type ExecuteTestOnceFn = (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
) => Promise<ITestRunResult>;

/**
 * Executes a single run of `test`.
 *
 * Builds the service collection for the run, executes `test.run(...)` and converts the outcome
 * into an `ITestRunResult`. This function does not reject when the test itself throws: errors from
 * `createSimulationAccessor` or from the test are turned into a `fail` result.
 * A `testRunStart` and a `testRunEnd` record are printed to the JSON output in both cases.
 *
 * @param parallelism used only to decide whether to draw the transient "Running scenario" line on a TTY.
 * @param outcomeDirectory directory handed to the simulation test runtime for this test.
 * @param runNumber zero-based index of this run.
 * @param isInRealExtensionHost selects the tools service implementation and is forwarded in the run info.
 */
export const executeTestOnce = async (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
	isInRealExtensionHost = false,
) => {
	const { opts, jsonOutputPrinter } = ctx;
	const fetchRequestCollector = new FetchRequestCollector();

	const currentTestRunInfo: CurrentTestRunInfo = {
		test,
		testRunNumber: runNumber,
		fetchRequestCollector: fetchRequestCollector,
		isInRealExtensionHost,
	};

	let testingServiceCollection: TestingServiceCollection;
	try {
		testingServiceCollection = await createSimulationAccessor(
			ctx.modelConfig,
			ctx.simulationServicesOptions,
			currentTestRunInfo
		);
	} catch (e) {
		// Setup failed before the test could start: report a start/end pair and a critical failure.
		const msg = e instanceof Error ? (e.stack ?? e.message) : String(e);
		console.error(`Error in createSimulationAccessor`, e);
		jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);
		jsonOutputPrinter.print({
			type: shared.OutputType.testRunEnd,
			name: test.fullName,
			runNumber,
			duration: 0,
			writtenFiles: [],
			error: msg,
			pass: false,
			explicitScore: undefined,
			annotations: undefined,
			averageRequestDuration: undefined,
			requestCount: 0,
			hasCacheMiss: false,
		} satisfies shared.ITestRunEndOutput);
		return {
			kind: 'fail',
			message: msg,
			contentFilterCount: 0,
			duration: 0,
			usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
			outcome: { kind: 'failed', error: msg, hitContentFilter: false, critical: true },
			cacheInfo: [],
			hasCacheMiss: false,
		} satisfies ITestRunResultFail;
	}

	testingServiceCollection.define(ISimulationOutcome, ctx.simulationOutcome);
	testingServiceCollection.define(ITokenizerProvider, ctx.tokenizerProvider);
	testingServiceCollection.define(ISimulationEndpointHealth, ctx.simulationEndpointHealth);
	testingServiceCollection.define(IJSONOutputPrinter, ctx.jsonOutputPrinter);
	testingServiceCollection.define(ITasksService, new TestTasksService());

	if (test.model || test.embeddingType) {
		// We prefer opts that come from the CLI over test specific args since Opts are global and must apply to the entire simulation
		const smartChatModel = (opts.smartChatModel ?? opts.chatModel) ?? test.model;
		const fastChatModel = (opts.fastChatModel ?? opts.chatModel) ?? test.model;
		const fastRewriteModel = (opts.fastRewriteModel ?? opts.chatModel) ?? test.model;
		testingServiceCollection.define(IEndpointProvider, new SyncDescriptor(TestEndpointProvider, [smartChatModel, fastChatModel, fastRewriteModel, currentTestRunInfo, opts.modelCacheMode === CacheMode.Disable, undefined]));
	}

	const simulationTestRuntime = (ctx.externalScenariosPath !== undefined)
		? new ExternalSimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber)
		: new SimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber);
	testingServiceCollection.define(ISimulationTestRuntime, simulationTestRuntime);
	testingServiceCollection.define(ISimulationTestContext, simulationTestRuntime);
	testingServiceCollection.define(ILogService, new SyncDescriptor(LogServiceImpl, [[new ConsoleLog(`🪵 ${currentTestRunInfo.test.fullName} (Run #${currentTestRunInfo.testRunNumber + 1}):\n`), simulationTestRuntime]]));

	testingServiceCollection.define(INewWorkspacePreviewContentManager, new SyncDescriptor(NewWorkspacePreviewContentManagerImpl));

	let snapshots: TestSnapshotsImpl | undefined;
	if (test.options.location) {
		snapshots = new TestSnapshotsImpl(test.options.location.path, test.fullName, runNumber);
		testingServiceCollection.define(ITestSnapshots, snapshots);
	}

	testingServiceCollection.define(IPromptWorkspaceLabels, new SyncDescriptor(PromptWorkspaceLabels));
	if (isInRealExtensionHost) {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(SimulationExtHostToolsService, [ctx.simulationServicesOptions.disabledTools]));
	} else {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(TestToolsService, [ctx.simulationServicesOptions.disabledTools]));
	}

	jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);
	if (process.stdout.isTTY && parallelism === 1) {
		// Transient progress line, truncated to the terminal width; erased again in `finally`.
		process.stdout.write(` Running scenario: ${test.fullName} - ${runNumber + 1}/${opts.nRuns}`.substring(0, process.stdout.columns - 1));
	}

	const testStartTime = Date.now();
	// `pass` and `err` are read by the `finally` block to report how the run ended.
	let pass = true;
	let err: unknown | undefined;
	try {
		await test.run(testingServiceCollection);
		await snapshots?.dispose();

		await fetchRequestCollector.complete();

		const result: ITestRunResultPass = {
			kind: 'pass',
			explicitScore: simulationTestRuntime.getExplicitScore(),
			usage: fetchRequestCollector.usage,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			outcome: simulationTestRuntime.getOutcome(),
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};

		return result;
	} catch (e) {
		pass = false;
		err = e;
		let msg = err instanceof Error ? (err.stack ? err.stack : err.message) : safeStringify(err);
		await fetchRequestCollector.complete();

		let critical = false;
		if (e instanceof BugIndicatingError || e instanceof TypeError) {
			critical = true;
		}
		if (e instanceof CriticalError) {
			critical = true;
			// For CriticalError only the message is reported, not the stack.
			msg = e.message;
		}

		const result: ITestRunResultFail = {
			kind: 'fail',
			message: msg,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			usage: fetchRequestCollector.usage,
			outcome: {
				kind: 'failed',
				error: msg,
				hitContentFilter: fetchRequestCollector.contentFilterCount > 0,
				critical,
			},
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};

		return result;
	} finally {
		// (context.safeGet(ILanguageFeaturesService) as { dispose?: () => Promise<void> })?.dispose?.();

		await simulationTestRuntime.writeFile(shared.SIMULATION_REQUESTS_FILENAME, JSON.stringify(fetchRequestCollector.interceptedRequests.map(r => r.toJSON()), undefined, 2), shared.REQUESTS_TAG);

		if (err) {
			simulationTestRuntime.log(`Scenario failed due to an error:`, err);
			if ((<any>err).code !== 'ERR_ASSERTION' && !(err instanceof IntentError)) {
				// Make visible to the console unexpected errors
				console.log(`Scenario ${test.fullName} failed due to an error:`);
				console.log(err);
			}
		}

		await simulationTestRuntime.flushLogs();

		jsonOutputPrinter.print({
			type: shared.OutputType.testRunEnd,
			name: test.fullName,
			runNumber,
			duration: Date.now() - testStartTime,
			writtenFiles: simulationTestRuntime.getWrittenFiles(),
			error: err instanceof Error ? `${err.message}\n${err.stack}` : JSON.stringify(err),
			pass,
			explicitScore: simulationTestRuntime.getExplicitScore(),
			annotations: simulationTestRuntime.getOutcome()?.annotations,
			averageRequestDuration: fetchRequestCollector.averageRequestDuration,
			requestCount: fetchRequestCollector.interceptedRequests.length,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		} satisfies shared.ITestRunEndOutput);
		if (process.stdout.isTTY && parallelism === 1) {
			// Carriage return + "erase in line" removes the transient progress line.
			process.stdout.write('\r\x1b[K');
		}

		testingServiceCollection.dispose();
	}
};

/**
 * When thrown, fails stest CI.
 */
export class CriticalError extends Error {
	constructor(message: string) {
		super(message);
		this.name = 'CriticalError';
	}
}