CoCalc -- simulationMain.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/test/simulationMain.ts
13383 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5
// Load env
6
import * as dotenv from 'dotenv';
7
dotenv.config();
8

9
// Needed for better stack traces as captureLocation parses the stack trace to find stests
10
import 'source-map-support/register';
11

12
// Load other imports
13
import * as fs from 'fs';
14
import minimist from 'minimist';
15
import { createConnection } from 'net';
16
import * as path from 'path';
17
import * as v8 from 'v8';
18
import type * as vscodeType from 'vscode';
19
import { SimpleRPC } from '../src/extension/onboardDebug/node/copilotDebugWorker/rpc';
20
import { ISimulationModelConfig, createExtensionUnitTestingServices } from '../src/extension/test/node/services';
21
import { CHAT_MODEL } from '../src/platform/configuration/common/configurationService';
22
import { IEndpointProvider, ModelSupportedEndpoint } from '../src/platform/endpoint/common/endpointProvider';
23
import { IModelConfig } from '../src/platform/endpoint/test/node/openaiCompatibleEndpoint';
24
import { fileSystemServiceReadAsJSON } from '../src/platform/filesystem/common/fileSystemService';
25
import { LogLevel } from '../src/platform/log/common/logService';
26
import { ParserWithCaching } from '../src/platform/parser/node/parserWithCaching';
27
import { structureComputer } from '../src/platform/parser/node/structure';
28
import { NullTelemetryService } from '../src/platform/telemetry/common/nullTelemetryService';
29
import { TokenizerProvider } from '../src/platform/tokenizer/node/tokenizer';
30
import { assert } from '../src/util/vs/base/common/assert';
31
import { Cache } from './base/cache';
32
import { IChatMLCache } from './base/cachingChatMLFetcher';
33
import { usedResourceCaches } from './base/cachingResourceFetcher';
34
import { ChatMLSQLiteCache } from './base/chatMLCache';
35
import { CompletionsSQLiteCache, ICompletionsCache } from './base/completionsCache';
36
import { usedEmbeddingsCaches } from './base/embeddingsCache';
37
import { TestingCacheSalts } from './base/salts';
38
import { ICompleteBaselineComparison, IModifiedScenario, SimulationBaseline } from './base/simulationBaseline';
39
import { CacheMode, CurrentTestRunInfo, SimulationServicesOptions, createSimulationChatModelThrottlingTaskLaunchers, loadConfigFile } from './base/simulationContext';
40
import { ProxiedSimulationEndpointHealth, SimulationEndpointHealthImpl } from './base/simulationEndpointHealth';
41
import { BASELINE_RUN_COUNT, SimulationOptions } from './base/simulationOptions';
42
import { ProxiedSimulationOutcome, SimulationOutcomeImpl } from './base/simulationOutcome';
43
import { drainStdoutAndExit } from './base/stdout';
44
import { SimulationSuite, SimulationTest, SimulationTestsRegistry, createSimulationTestFilter } from './base/stest';
45
import { CollectingJSONOutputPrinter, ConsoleJSONOutputPrinter, IJSONOutputPrinter, ProxiedSONOutputPrinter } from './jsonOutputPrinter';
46
import { green, orange, red, violet, yellow } from './outputColorer';
47
import { runInputPipeline, runInputPipelineParallel } from './pipeline/pipeline';
48
import { ITestDiscoveryOptions, discoverTests } from './simulation/externalScenarios';
49
import { discoverCoffeTests } from './simulation/nesCoffeTests';
50
import { discoverNesTests } from './simulation/nesExternalTests';
51
import { OLD_BASELINE_FILENAME, OutputType, PRODUCED_BASELINE_FILENAME, REPORT_FILENAME, RUN_METADATA, SCORECARD_FILENAME, SIMULATION_FOLDER_NAME, generateOutputFolderName } from './simulation/shared/sharedTypes';
52
import { logger } from './simulationLogger';
53
import { IInitParams, IInitResult, IRunTestParams, IRunTestResult } from './testExecutionInExtension';
54
import { GroupedScores, ITestResult, SimulationTestContext, executeTestOnce, executeTests } from './testExecutor';
55
import { createScoreRenderer, fileExists, printTime } from './util';
56
// Folder named SIMULATION_FOLDER_NAME located one level above this module's directory;
// used as the parent of the run output folder when no explicit output path is given.
const dotSimulationPath = path.join(__dirname, `../${SIMULATION_FOLDER_NAME}`);
57

58
/**
 * Process entry point of the simulation runner.
 *
 * Runs in extension-host mode when `VSCODE_SIMULATION_EXTENSION_ENTRY` is set,
 * otherwise as a CLI driven by the process arguments. Every error gathered
 * along the way (thrown, returned by `run`, or an unhandled rejection) is
 * printed, and the process exits with code 1 if there was at least one.
 */
async function main() {
	const collected: unknown[] = [];

	process.env['SIMULATION'] = '1';

	// Record stray rejections so that they make the command fail.
	process.on('unhandledRejection', (reason, promise) => {
		console.error('\n\nUnhandled Rejection at: Promise', promise, 'reason:', reason);
		collected.push('unhandled rejection: ' + reason);
	});

	try {
		if (!process.env.VSCODE_SIMULATION_EXTENSION_ENTRY) {
			const opts = SimulationOptions.fromProcessArgs();
			const outcome = await run(opts);
			if (outcome) {
				collected.push(...outcome.errors);
			}
		} else {
			await runInExtensionHost();
		}
	} catch (err) {
		collected.push(err?.stack || err?.message || String(err));
	}

	if (collected.length > 0) {
		console.error(`\n${red('⚠️⚠️⚠️  Command failed with:')}\n\n`);

		// Errors are numbered only when there is more than one.
		collected.forEach((error, i) => {
			const idx = `Error${collected.length === 1 ? '' : ` ${i + 1})`} `;
			console.error(`\t${idx}${error}\n\n`);
		});
	}

	await drainStdoutAndExit(collected.length === 0 ? 0 : 1);
}
93

94
/** Result of a CLI run: either nothing, or the list of errors that `main` reports before exiting with a non-zero code. */
type RunResult = void | { errors: unknown[] };
95

96
/**
 * Dispatches a CLI invocation to the mode selected by `opts`.
 *
 * Modes are checked in this order: help (nes-datagen specific, then general),
 * model listing, the nes-datagen input pipeline, suite/test listing, and
 * finally a regular test run.
 *
 * @param opts parsed simulation options
 * @returns whatever the selected mode returns; only a test run yields errors
 */
async function run(opts: SimulationOptions): Promise<RunResult> {
	const jsonOutputPrinter: IJSONOutputPrinter = opts.jsonOutput ? new ConsoleJSONOutputPrinter() : new CollectingJSONOutputPrinter();

	if (opts.externalCacheLayersPath) {
		process.env['EXTERNAL_CACHE_LAYERS_PATH'] = opts.externalCacheLayersPath;
	}

	// Flags are compared strictly against `true`: only a literal boolean `true` selects a mode.
	if ((opts.help && opts.subcommand === 'nes-datagen') === true) {
		return opts.printTrainHelp();
	}

	if (opts.help === true) {
		return opts.printHelp();
	}

	if (opts.listModels === true) {
		await listChatModels(opts.modelCacheMode === CacheMode.Disable);
		return;
	}

	if (opts.nesDatagen) {
		// Fan out to parallel workers unless this process is itself a worker.
		const fanOut = opts.parallelism > 1 && !opts.nesDatagen.workerMode;
		if (fanOut) {
			await runInputPipelineParallel(opts);
		} else {
			await runInputPipeline(opts);
		}
		return;
	}

	if (opts.listSuites === true || opts.listTests === true) {
		// stest runner extension runs with both `list-tests` and `list-suites` flags, so they should not be mutually exclusive
		const { allSuites } = await loadTests(opts);

		if (opts.listSuites) {
			listSuites(allSuites, opts, jsonOutputPrinter);
		}
		if (opts.listTests) {
			listTests(allSuites, opts, jsonOutputPrinter);
		}
		return;
	}

	return runTests(opts, jsonOutputPrinter);
}
137

138
/**
 * Runs the simulation as a worker inside an extension host.
 *
 * Connects to the control socket on 127.0.0.1 at the port given by
 * `VSCODE_SIMULATION_CONTROL_PORT`, registers the `runTest` and `close` RPC
 * methods, and calls `init` on the peer to obtain the argv used to build the
 * options and the test environment. The returned promise resolves once the
 * peer invokes `close`.
 */
async function runInExtensionHost() {
	// Hook for the js-debug bootloader, which is not automatically executed in the extension host
	const nodeOptions = process.env.NODE_OPTIONS;
	if (nodeOptions) {
		// NODE_OPTIONS is a CLI argument fragment that we need to parse here
		const tokenPattern = /"(?:\\.|[^"\\])*"|'(?:\\.|[^'\\])*'|\S+/g;
		const args: string[] = [];
		for (const match of nodeOptions.matchAll(tokenPattern)) {
			const token = match[0];
			const quote = token[0];
			// Remove surrounding quotes and unescape internal quotes if necessary
			const isQuoted = quote === token.at(-1) && (quote === '"' || quote === '\'');
			args.push(isQuoted ? token.slice(1, -1).replaceAll(`\\${quote}`, quote) : token);
		}

		const parsed = minimist(args);
		if (parsed.require) {
			const reqPaths = Array.isArray(parsed.require) ? parsed.require : [parsed.require];
			logger.info(`Loading NODE_OPTIONS require: ${reqPaths.join(', ')}`);
			for (const r of reqPaths) {
				require(r);
			}
		}
	}

	const port = Number(process.env.VSCODE_SIMULATION_CONTROL_PORT);
	const rpc = await new Promise<SimpleRPC>((resolve, reject) => {
		const socket = createConnection({ host: '127.0.0.1', port });
		socket.on('connect', () => resolve(new SimpleRPC(socket)));
		socket.on('error', reject);
	});

	const vscode: typeof vscodeType = require('vscode');
	const folder = vscode.workspace.workspaceFolders![0];

	// Forward device-code auth prompts to the controlling process.
	Cache.Instance.on('deviceCodeCallback', (url: string) => {
		rpc.callMethod('deviceCodeCallback', { url });
	});

	// Handler for a single test execution request; waits for initialization (`allTests`) first.
	const handleRunTest = async (params: IRunTestParams): Promise<IRunTestResult> => {
		const { simulationTestContext, tests } = await allTests;

		// Reset state accumulated by previous requests.
		simulationTestContext.baseline.clear();
		const failures = simulationTestContext.simulationEndpointHealth.failures;
		failures.splice(0, failures.length);

		const test = tests.get(params.testName);
		if (!test) {
			throw new Error(`Test ${params.testName} not found`);
		}

		const result = await executeTestOnce(simulationTestContext, 1, params.outcomeDirectory, test, params.runNumber, true);
		return { result };
	};
	rpc.registerMethod('runTest', handleRunTest);

	const initParams = { folder: folder.uri.fsPath } satisfies IInitParams;
	const allTests = rpc.callMethod('init', initParams).then(async (res: IInitResult) => {
		const opts = SimulationOptions.fromArray(res.argv);
		const { testsToRun } = await loadTests(opts);
		const { simulationTestContext } = await prepareTestEnvironment(opts, new ProxiedSONOutputPrinter(rpc), rpc);
		const tests = new Map(testsToRun.map(t => [t.fullName, t]));
		return { opts, tests, simulationTestContext };
	});

	return new Promise<void>(resolve => {
		rpc.registerMethod('close', async () => {
			resolve();
		});
	});
}
211

212
/**
 * Prepares everything needed before tests can be executed: log level, parser
 * cache sizes, the list of tests, the output folder, the baseline and the
 * shared simulation test context.
 *
 * @param opts parsed simulation options
 * @param jsonOutputPrinter receives the initial test summary
 * @param rpcInExtensionHost when given, the output folder is left untouched
 *        and is forwarded to `createSimulationTestContext`
 * @returns the members produced by `createSimulationTestContext` plus the
 *          tests to run, baseline information and the resolved output path
 */
async function prepareTestEnvironment(opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter, rpcInExtensionHost?: SimpleRPC) {
	if (opts.verbose) {
		logger.setLogLevel(LogLevel.Trace);
	}

	// Configure caching
	if (opts.parallelism > 1) {
		// To get good cache behavior, we must increase the cache size considerably
		const cacheSize = Math.max(5, 2 * opts.parallelism);
		ParserWithCaching.CACHE_SIZE_PER_LANGUAGE = cacheSize;
		structureComputer.setCacheSize(cacheSize);
	}
	fileSystemServiceReadAsJSON.enable();

	const { allSuites, testsToRun, externalScenariosPath } = await loadTests(opts);

	// Default to a generated folder; a relative --output is resolved against the current working directory.
	const requestedOutput = opts.output;
	const outputPath = requestedOutput === undefined
		? path.join(dotSimulationPath, generateOutputFolderName())
		: (path.isAbsolute(requestedOutput) ? requestedOutput : path.join(process.cwd(), requestedOutput));

	if (!rpcInExtensionHost) { // don't clean if we're just one participant in a larger run
		await clearOrCreateDir(outputPath);
	}

	jsonOutputPrinter.print({
		type: OutputType.initialTestSummary,
		runOutputFolderName: path.basename(outputPath),
		testsToRun: testsToRun.map(t => t.fullName),
		nRuns: opts.nRuns
	});

	const allTests = allSuites.flatMap(cur => cur.tests);
	const hasFilteredTests = testsToRun.length !== allTests.length;

	if (!opts.jsonOutput) {
		console.log(hasFilteredTests
			? `Due to grep filters, will execute ${testsToRun.length} out of ${allTests.length} simulations. Each simulation runs ${opts.nRuns} time(s).\n`
			: `Will execute ${testsToRun.length} simulations. Each simulation runs ${opts.nRuns} time(s).\n`);
	}

	writeHeapSnapshot(opts.heapSnapshots, 'before');

	const canUseBaseline = (opts.nRuns === BASELINE_RUN_COUNT); // only use baseline if running N times
	const runningAllTests = (opts.grep === undefined && opts.omitGrep === undefined);

	let baselinePath: string;
	if (opts.externalBaseline) {
		assert(opts.externalScenarios !== undefined, 'externalBaseline must be set only with externalScenarios');
		baselinePath = path.join(opts.externalScenarios, 'baseline.json');
	} else {
		baselinePath = SimulationBaseline.DEFAULT_BASELINE_PATH;
	}

	const baseline = await SimulationBaseline.readFromDisk(baselinePath, runningAllTests);

	if (canUseBaseline) { // copy current baseline as the baseline before the run
		await fs.promises.copyFile(baseline.baselinePath, path.join(outputPath, OLD_BASELINE_FILENAME));
	}

	const configs = opts.configFile ? loadConfigFile(opts.configFile) : undefined;

	const context = createSimulationTestContext(opts, runningAllTests, baseline, canUseBaseline, jsonOutputPrinter, outputPath, externalScenariosPath, rpcInExtensionHost, configs);

	return {
		...context,
		testsToRun,
		baseline,
		canUseBaseline,
		outputPath,
		runningAllTests,
		hasFilteredTests,
	};
}
291

292
/**
 * Executes the selected simulation tests and writes the run artifacts
 * (score card, report, produced baseline, optional run metadata) into the
 * output folder.
 *
 * @param opts parsed simulation options
 * @param jsonOutputPrinter sink for structured output
 * @returns the errors that should make the command fail (possibly none)
 */
async function runTests(opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter): Promise<RunResult> {
	const errors: unknown[] = [];

	// Show device-code auth prompts either as structured output or on the console.
	Cache.Instance.on('deviceCodeCallback', (url: string) => {
		if (!opts.jsonOutput) {
			console.log(`⚠️ \x1b[31mAuth Required!\x1b[0m Please open the link: ${url}`);
			return;
		}
		jsonOutputPrinter.print({ type: OutputType.deviceCodeCallback, url });
	});

	const environment = await prepareTestEnvironment(opts, jsonOutputPrinter);
	const { simulationEndpointHealth, simulationOutcome, simulationTestContext, testsToRun, baseline, canUseBaseline, outputPath, runningAllTests, hasFilteredTests } = environment;

	if (opts.gc) {
		if (opts.externalCacheLayersPath) {
			throw new Error('--gc is currently not compatible with --external-cache-layers-path');
		}
		Cache.Instance.gcStart();
	}

	const totalStartTime = Date.now();
	const { testResultsPromises, getGroupedScores } = await executeTests(simulationTestContext, testsToRun);

	console.log('Waiting on test results...');

	const testResults = await Promise.all(testResultsPromises);

	writeHeapSnapshot(opts.heapSnapshots, 'after');

	const totalTime = Date.now() - totalStartTime;

	if (opts.gc) {
		Cache.Instance.gcEnd();
	}

	// Critical failures of individual runs fail the whole command.
	for (const result of testResults) {
		result.outcomes.forEach((outcome, runIdx) => {
			if (outcome?.kind === 'failed' && outcome.critical) {
				errors.push(`Test failed: ${result.test}, run ${runIdx}\n` + outcome.error);
			}
		});
	}

	// this allows to quickly identify which new cache entries were created in this particular simulation run
	if (opts.stageCacheEntries && !opts.externalScenarios) {
		// TODO@joaomoreno
		console.warn('!!! Determining new cache entries is not yet working in Redis, ask Joao to implement it');
	}

	const groupedScores = await getGroupedScores();
	printOutcome(groupedScores, testsToRun, baseline, opts, canUseBaseline, runningAllTests, testResults, totalTime);

	const scoreCardPath = path.join(outputPath, SCORECARD_FILENAME);
	await fs.promises.writeFile(scoreCardPath, toCsv(buildScoreTable(groupedScores)));

	if (simulationOutcome instanceof SimulationOutcomeImpl) {
		if (!opts.noCachePointer) {
			await simulationOutcome.write();
		}

		if (!opts.externalScenarios && !hasFilteredTests) {
			await simulationOutcome.cleanFolder();
		}
	}

	if (canUseBaseline) {
		await baseline.writeToDisk(path.join(outputPath, PRODUCED_BASELINE_FILENAME));
	}

	if (opts.isUpdateBaseline) {
		if (!canUseBaseline) {
			errors.push(`Cannot update baseline for ${opts.nRuns} run(s). Please use --n=${BASELINE_RUN_COUNT}.`);
		} else {
			await baseline.writeToDisk();
		}
	}

	await jsonOutputPrinter.flush?.(outputPath);

	const reportPath = path.join(outputPath, REPORT_FILENAME);
	await fs.promises.writeFile(reportPath, JSON.stringify(testResults, null, '\t'));

	if (opts.label) {
		const metadataPath = path.join(outputPath, RUN_METADATA);
		await fs.promises.writeFile(metadataPath, JSON.stringify({ label: opts.label }, null, '\t'));
	}

	// Enable if you want to see which cache entries were used in this simulation run
	const writeUsedOtherCaches = false;
	if (writeUsedOtherCaches) {
		const usedCaches = ([] as string[])
			.concat(Array.from(usedEmbeddingsCaches))
			.concat(Array.from(usedResourceCaches));
		await fs.promises.writeFile('other-caches.json', JSON.stringify(usedCaches));
	}

	if (opts.ci && !opts.isUpdateBaseline) {
		// In CI (without a baseline update) any deviation from the baseline is an error.
		const error = validateChangeStats(baseline.compare());
		if (error) {
			errors.push(red(`${error.errorMessage}. Please run 'npm run simulate-update-baseline' and check in baseline.json.`));
		}
	} else if (simulationEndpointHealth.failures.length > 0) {
		const countByType = (type: string) => simulationEndpointHealth.failures.filter(f => f.request.type === type).length;
		const rateLimitedCount = countByType('rateLimited');
		const failedCount = countByType('failed');

		// If there were simulation endpoint failures and we are doing a
		// CI baseline update, fail the CI so that we block PR merge
		if (opts.ci && opts.isUpdateBaseline) {
			errors.push(
				red(`Encountered server failures while running simulation: ${rateLimitedCount} rate limited responses, ${failedCount} other failed responses. Please rerun the simulation!`),
				...simulationEndpointHealth.failures.map(f => `- ${f.testInfo.testName}: ${f.request.reason}`),
			);
		}
	}

	return { errors };
}
414

415
/**
 * Determines the suites and tests for this invocation.
 *
 * With `--external-scenarios`, the flags are validated and tests are
 * discovered from that folder (or the folder is handed to the registry when
 * `scenarioTest` is set). If that yields no tests, the registered simulation
 * tests are loaded with the scenario/grep filters applied.
 *
 * @param opts parsed simulation options
 * @returns all suites, the tests to run, and the absolute external scenarios path (if any)
 */
async function loadTests(opts: SimulationOptions) {
	let allSuites: readonly SimulationSuite[] = [];
	let testsToRun: readonly SimulationTest[] = [];

	let externalScenariosPath = opts.externalScenarios;
	if (externalScenariosPath) {
		let usageError = false;
		const reportUsageError = (message: string) => {
			usageError = true;
			console.error(message);
		};

		const modeFlags = [opts.inline, opts.sidebar, opts.nes];
		if (!opts.inline && !opts.sidebar && !opts.nes) {
			reportUsageError(`Missing --inline or --sidebar or --nes flag`);
		}
		if (modeFlags.filter(Boolean).length > 1) {
			reportUsageError(`Can only have one of --inline or --sidebar or --nes flags set`);
		}
		if (typeof opts.output !== 'string') {
			reportUsageError(`Missing --output flag`);
		}

		if (usageError) { // process.exit() if there's a usage error
			console.error(`Usage: npm run simulate -- --external-scenarios=<path> --inline --output=<path>`);
			console.error(`Usage: npm run simulate -- --external-scenarios=<path> --sidebar --output=<path>`);
			await drainStdoutAndExit(1);
		}

		// Update paths to be absolute
		// If it's not an absolute path, make it relative to the current working directory
		if (!path.isAbsolute(externalScenariosPath)) {
			externalScenariosPath = path.join(process.cwd(), externalScenariosPath);
		}

		if (opts.scenarioTest) {
			SimulationTestsRegistry.setInputPath(externalScenariosPath);
		} else {
			const filter = createSimulationTestFilter(opts.grep, opts.omitGrep);

			if (!opts.nes) {
				const testDiscoveryOptions: ITestDiscoveryOptions = {
					chatKind: (opts.inline && !opts.sidebar) ? 'inline' : 'panel',
					applyChatCodeBlocks: opts.applyChatCodeBlocks,
				};
				allSuites = await discoverTests(externalScenariosPath, testDiscoveryOptions);
			} else if (opts.nes === 'external') {
				// run external stests
				allSuites = [await discoverNesTests(externalScenariosPath, opts)];
			} else {
				// run coffe stests
				allSuites = [await discoverCoffeTests(externalScenariosPath, opts)];
			}

			const discovered = allSuites.flatMap(suite => suite.tests).filter(filter);
			discovered.sort((t0, t1) => t0.fullName.localeCompare(t1.fullName));
			testsToRun = discovered;
		}
	}

	// Nothing selected so far: use the registered simulation tests.
	if (testsToRun.length === 0) {
		SimulationTestsRegistry.setFilters(opts.scenarioTest, opts.grep, opts.omitGrep);
		await import('./simulationTests');
		allSuites = SimulationTestsRegistry.getAllSuites();
		testsToRun = SimulationTestsRegistry.getAllTests();
	}

	return { allSuites, testsToRun, externalScenariosPath };
}
482

483
/**
 * Emits one `detectedSuite` record (name and location) per suite.
 *
 * @param allSuites suites to report
 * @param opts unused here; kept for a signature parallel to `listTests`
 * @param jsonOutputPrinter sink for the records
 */
function listSuites(allSuites: readonly SimulationSuite[], opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter) {
	allSuites.forEach(suite => {
		const { fullName: name, options } = suite;
		jsonOutputPrinter.print({ type: OutputType.detectedSuite, name, location: options.location });
	});
}
488

489
/**
 * Emits one `detectedTest` record per test of every suite and, unless JSON
 * output is requested, also prints the test name to the console.
 *
 * @param allSuites suites whose tests are listed, in suite order
 * @param opts only `jsonOutput` is consulted
 * @param jsonOutputPrinter sink for the records
 */
function listTests(allSuites: readonly SimulationSuite[], opts: SimulationOptions, jsonOutputPrinter: IJSONOutputPrinter) {
	// we should just list all tests
	allSuites
		.flatMap(suite => suite.tests)
		.forEach(test => {
			jsonOutputPrinter.print({
				type: OutputType.detectedTest,
				suiteName: test.suite.fullName,
				name: test.fullName,
				location: test.options.location
			});
			if (opts.jsonOutput) {
				return;
			}
			console.log(` - ${test.fullName}`);
		});
}
499

500
/**
 * Prints a table of the available chat models, grouped by model family.
 * Families and the models inside each family are sorted for a stable display.
 *
 * @param skipCache forwarded as `skipModelMetadataCache` to the testing services
 */
async function listChatModels(skipCache: boolean = false) {
	const accessor = createExtensionUnitTestingServices(undefined, undefined, { skipModelMetadataCache: skipCache }).createTestingAccessor();
	const chatEndpoints = await accessor.get(IEndpointProvider).getAllChatEndpoints();
	console.log('Available Chat Models:\n');

	// Group models by family
	const modelsByFamily = new Map<string, string[]>();
	for (const endpoint of chatEndpoints) {
		const family = endpoint.family || 'Other'; // Default family name if not specified
		const bucket = modelsByFamily.get(family);
		if (bucket) {
			bucket.push(endpoint.model);
		} else {
			modelsByFamily.set(family, [endpoint.model]);
		}
	}

	// One row per family, ordered by family name, with its models sorted as well
	const tableData: { Family: string; Models: string }[] = [...modelsByFamily.entries()]
		.sort(([left], [right]) => left.localeCompare(right))
		.map(([family, models]) => ({ Family: family, Models: models.sort().join(', ') }));

	console.table(tableData);
}
535

536
/**
 * Assembles the context object shared by all simulation tests of a run.
 *
 * When `rpcInExtensionHost` is given, endpoint health and outcome are proxied
 * over that RPC connection; otherwise local implementations are used.
 *
 * @returns the context together with its endpoint-health and outcome members
 */
function createSimulationTestContext(
	opts: SimulationOptions,
	runningAllTests: boolean,
	baseline: SimulationBaseline,
	canUseBaseline: boolean,
	jsonOutputPrinter: IJSONOutputPrinter,
	outputPath: string,
	externalScenariosPath: string | undefined,
	rpcInExtensionHost: SimpleRPC | undefined,
	configs: Record<string, unknown> | undefined,
) {
	const simulationEndpointHealth = rpcInExtensionHost ? new ProxiedSimulationEndpointHealth(rpcInExtensionHost) : new SimulationEndpointHealthImpl();

	// No cache factories at all when the language-model cache is disabled.
	const cachingDisabled = opts.lmCacheMode === CacheMode.Disable;
	if (cachingDisabled) {
		console.warn('❗ Not using any cache');
	}
	const createChatMLCache: ((info: CurrentTestRunInfo) => IChatMLCache) | undefined = cachingDisabled
		? undefined
		: info => new ChatMLSQLiteCache(TestingCacheSalts.requestCacheSalt, info);
	const createNesFetchCache: ((info: CurrentTestRunInfo) => ICompletionsCache) | undefined = cachingDisabled
		? undefined
		: info => new CompletionsSQLiteCache(TestingCacheSalts.nesFetchCacheSalt, info);

	const simulationServicesOptions: SimulationServicesOptions = {
		createChatMLCache,
		createNesFetchCache,
		chatModelThrottlingTaskLaunchers: createSimulationChatModelThrottlingTaskLaunchers(opts.boost),
		isNoFetchModeEnabled: opts.noFetch,
		languageModelCacheMode: opts.lmCacheMode,
		resourcesCacheMode: opts.resourcesCacheMode,
		disabledTools: opts.disabledTools,
		summarizeHistory: opts.summarizeHistory,
		swebenchPrompt: opts.swebenchPrompt,
		useExperimentalCodeSearchService: opts.useExperimentalCodeSearchService,
		configs
	};

	// Custom model configurations from the optional model config file, keyed by id.
	const customModelConfigs: Map<string, IModelConfig> = new Map();
	if (opts.modelConfigFile) {
		console.log('Using model configuration file: ' + opts.modelConfigFile);
		for (const config of parseModelConfigFile(opts.modelConfigFile)) {
			customModelConfigs.set(config.id, config);
		}
	}

	const modelConfig: ISimulationModelConfig = {
		chatModel: opts.chatModel,
		fastChatModel: opts.fastChatModel,
		smartChatModel: opts.smartChatModel,
		embeddingType: opts.embeddingType,
		fastRewriteModel: opts.fastRewriteModel,
		skipModelMetadataCache: opts.modelCacheMode === CacheMode.Disable,
		customModelConfigs,
	};

	const simulationOutcome = rpcInExtensionHost ? new ProxiedSimulationOutcome(rpcInExtensionHost) : new SimulationOutcomeImpl(runningAllTests);

	// this is expensive so we share it across all stests
	const tokenizerProvider = new TokenizerProvider(false, new NullTelemetryService());

	const simulationTestContext: SimulationTestContext = {
		opts,
		baseline,
		canUseBaseline,
		jsonOutputPrinter,
		outputPath,
		externalScenariosPath,
		modelConfig,
		simulationServicesOptions,
		simulationOutcome,
		simulationEndpointHealth,
		tokenizerProvider
	};

	return { simulationTestContext, simulationEndpointHealth, simulationOutcome };
}
612

613
/**
 * Prints the human-readable summary of a finished run to the console:
 * optional per-suite tables (10 or more tests), the score summary (exact
 * when the baseline can be used, approximate otherwise), the change counters
 * relative to the baseline, the improved/worsened scenarios and the run time.
 */
function printOutcome(
	groupedScores: GroupedScores,
	testsToRun: readonly SimulationTest[],
	baseline: SimulationBaseline,
	opts: SimulationOptions,
	canUseBaseline: boolean,
	runningAllTests: boolean,
	testResults: ITestResult[],
	totalTime: number
): void {
	const shouldShowSummaries = (testsToRun.length >= 10); // only when running at least 10 tests
	const shouldBeBrief = (testsToRun.length === 1); // when running a single test, be brief

	if (shouldShowSummaries) {
		type Totals = { count: number; scoreSum: number };
		const toPercent = (totals: Totals) => totals.count ? (totals.scoreSum / totals.count * 100).toFixed(2) : '-';

		const modelComparisonTable: object[] = [];
		for (const [suiteName, scoresPerSuite] of groupedScores.entries()) {
			// Sum up scores per model across all languages of the suite.
			const totalsByModel = new Map<string, Totals>();
			for (const [, scoresPerLanguage] of scoresPerSuite.entries()) {
				for (const [model, scoresPerModel] of scoresPerLanguage.entries()) {
					if (!model) {
						continue;
					}
					const totals = totalsByModel.get(model) ?? { count: 0, scoreSum: 0 };
					totals.count += scoresPerModel.length;
					totals.scoreSum += scoresPerModel.reduce((acc, curr) => acc + curr, 0);
					totalsByModel.set(model, totals);
				}
			}

			// A comparison row only makes sense when both models have results for the suite.
			// NOTE(review): the 'GPT-4o' column is fed from CHAT_MODEL.GPT41 — the label looks stale; confirm.
			const primary = totalsByModel.get(CHAT_MODEL.GPT41);
			const mini = totalsByModel.get(CHAT_MODEL.GPT4OMINI);
			if (primary && mini) {
				const eitherEmpty = primary.count === 0 || mini.count === 0;
				modelComparisonTable.push({
					Suite: suiteName,
					'# of tests': eitherEmpty ? primary.count || mini.count : `${primary.count} <> ${mini.count}`,
					'GPT-4o': toPercent(primary),
					'GPT-4o-mini': toPercent(mini),
				});
			}
		}
		if (modelComparisonTable.length !== 0) {
			console.log(`\n${yellow('Suite Summary by Model:')}`);
			console.table(modelComparisonTable);
		}

		console.log(`\n${yellow('Suite Summary by Language:')}`);
		console.table(buildScoreTable(groupedScores));
	}

	const changeStats = baseline.compare();
	const scoreToString = createScoreRenderer(opts, canUseBaseline);
	const printChanged = (changedScenarios: IModifiedScenario[]) => {
		changedScenarios.forEach(scenario => {
			const prettyScore = `${scoreToString(scenario.prevScore)} -> ${scoreToString(scenario.currScore)}`;
			const color = scenario.currScore > scenario.prevScore ? green : red;
			console.log(`  - [${color(prettyScore)}] ${scenario.name}`);
		});
	};

	// Score header: exact figures with a usable baseline, an approximation otherwise.
	if (canUseBaseline) {
		console.log(`\nSummary:`);
		if (!shouldBeBrief) {
			if (!runningAllTests) {
				console.log(`  Tests Score: ${baseline.currentScore.toFixed(2)}%`);
			}
			console.log(`Overall Score: ${baseline.overallScore.toFixed(2)}%`);
		}
	} else if (!shouldBeBrief) {
		console.log(`\n${yellow(`Approximate Summary (due to using --n=${opts.nRuns} instead of --n=${BASELINE_RUN_COUNT}):`)}`);
		const score = testResults.reduce((prev, curr) => prev + curr.score, 0);
		console.log(`Overall Approximate Score: ${(score / testsToRun.length * 100).toFixed(2)} / 100`);
	}

	// Without a usable baseline only clear changes are reported, hence the qualifier.
	const qualifier = canUseBaseline ? '' : 'clearly ';
	if (changeStats.nImproved > 0) {
		console.log(`${green('▲')} - Score ${qualifier}improved in ${changeStats.nImproved} scenarios`);
	}
	if (changeStats.nWorsened > 0) {
		console.log(`${red('▼')} - Score ${qualifier}decreased in ${changeStats.nWorsened} scenarios`);
	}
	if (changeStats.nUnchanged > 0) {
		console.log(`= - Score unchanged in ${changeStats.nUnchanged} scenarios`);
	}
	if (changeStats.addedScenarios > 0) {
		console.log(`${violet('◆')} - New scenarios count - ${changeStats.addedScenarios}`);
	}
	if (changeStats.removedScenarios > 0) {
		console.log(`${orange('●')} - Missing ${changeStats.removedScenarios} scenarios.`);
	}
	if (changeStats.skippedScenarios > 0) {
		console.log(`${yellow('●')} - Skipped ${changeStats.skippedScenarios} scenarios.`);
	}

	const hasImproved = changeStats.improvedScenarios.length > 0;
	const hasWorsened = changeStats.worsenedScenarios.length > 0;
	if (hasImproved || hasWorsened) {
		console.log();
	}
	if (hasImproved) {
		console.log(`${green('Improved')}:`);
		printChanged(changeStats.improvedScenarios);
	}
	if (hasWorsened) {
		console.log(`${red('Worsened')}:`);
		printChanged(changeStats.worsenedScenarios);
	}

	console.log(`\n  Simulation finished(${printTime(totalTime)}) \n`);
}
729

730
/**
 * Flattens the grouped scores into one table row per suite / language / model
 * combination. Each row carries the number of scores and their average as a
 * percentage rounded to two decimals; a missing language or model is shown as '-'.
 *
 * @param groupedScores scores keyed by suite, then language, then model
 * @returns the rows in iteration order of the nested maps
 */
function buildScoreTable(groupedScores: GroupedScores): object[] {
	const averagePercent = (scores: number[]): number => {
		const total = scores.reduce((acc, curr) => acc + curr, 0);
		return Number((total / scores.length * 100).toFixed(2));
	};

	return Array.from(groupedScores.entries()).flatMap(([suiteName, scoresPerSuite]) =>
		Array.from(scoresPerSuite.entries()).flatMap(([language, scoresPerLanguage]) =>
			Array.from(scoresPerLanguage.entries()).map(([model, scoresPerModel]): object => ({
				Suite: suiteName,
				Language: language ?? '-',
				Model: model ?? '-',
				'# of tests': scoresPerModel.length,
				'Score(%)': averagePercent(scoresPerModel),
			}))
		)
	);
}
748

749
/**
 * Decides whether a baseline comparison should be reported as a failure.
 *
 * The conditions are evaluated in a fixed order (worsened, improved, added, removed,
 * mandatory skipped) and the first one with a positive count determines the message.
 *
 * @param changeStats The comparison against the baseline.
 * @returns An object with the error message of the first failing condition, or
 * `undefined` when none of the conditions applies.
 */
function validateChangeStats(changeStats: ICompleteBaselineComparison): { errorMessage: string } | undefined {
	// Counts are read lazily so that a later field is only touched once every earlier check has passed.
	const failureChecks: ReadonlyArray<readonly [() => number, string]> = [
		[() => changeStats.nWorsened, 'Some scenarios have worsened'],
		[() => changeStats.nImproved, 'Some scenarios have improved'],
		[() => changeStats.addedScenarios, 'New scenarios detected'],
		[() => changeStats.removedScenarios, 'Some scenarios were removed'],
		// Skips only count as a failure for mandatory scenarios.
		[() => changeStats.mandatory.skippedScenarios, 'Some mandatory scenarios were skipped'],
	];

	for (const [readCount, errorMessage] of failureChecks) {
		if (readCount() > 0) {
			return { errorMessage };
		}
	}
	return undefined;
}
772

773
/**
 * Writes a V8 heap snapshot when one was requested and logs the path returned by V8.
 *
 * @param snapshotFilename `undefined` or `false` disables the snapshot. A string is used
 * as the file name prefix (`<prefix>-<label>.heapsnapshot`). Any other value (i.e. `true`)
 * calls `v8.writeHeapSnapshot` without a file name.
 * @param label Distinguishes the snapshot taken before the run from the one taken after.
 */
function writeHeapSnapshot(snapshotFilename: boolean | string | undefined, label: 'before' | 'after') {
	const snapshotRequested = snapshotFilename !== undefined && snapshotFilename !== false;
	if (snapshotRequested) {
		let targetFile: string | undefined;
		if (typeof snapshotFilename === 'string') {
			targetFile = `${snapshotFilename}-${label}.heapsnapshot`;
		}
		const writtenPath = v8.writeHeapSnapshot(targetFile);
		console.log(`Writing heap snapshot: ${writtenPath}`);
	}
}
781

782
/**
 * Ensures that `path` exists as an empty directory.
 *
 * Anything currently at `path` is removed recursively, then the directory (including
 * missing parent directories) is created.
 *
 * @param path Location of the directory to reset.
 */
async function clearOrCreateDir(path: string) {
	// `force: true` makes `rm` ignore a missing path, so a separate existence check
	// beforehand is unnecessary (and would only add a check-then-act gap).
	await fs.promises.rm(path, { recursive: true, force: true });
	await fs.promises.mkdir(path, { recursive: true });
}
788

789
/**
 * Serializes rows into CSV text with a header line.
 *
 * The header is taken from the keys of the first row; every row contributes its own
 * values in property order. Each line, including the last, ends with `\n`.
 * Fields containing a comma, a double quote or a line break are wrapped in double quotes
 * with embedded quotes doubled, so such values no longer shift or split columns.
 * `null` and `undefined` values are written as empty fields.
 *
 * @param rows Objects to serialize.
 * @returns The CSV text, or an empty string when `rows` is empty.
 */
function toCsv(rows: object[]): string {
	if (rows.length === 0) { return ''; }

	const escapeField = (value: unknown): string => {
		// Mirrors Array.prototype.join, which renders null/undefined as an empty string.
		const text = value === null || value === undefined ? '' : String(value);
		return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
	};
	const toLine = (fields: readonly unknown[]): string => fields.map(escapeField).join(',') + '\n';

	const header = toLine(Object.keys(rows[0]));
	const rowsStr = rows.map(row => toLine(Object.values(row))).join('');

	return header + rowsStr;
}
797

798
/**
 * Reads a model configuration JSON file, validates every entry and converts it into an
 * `IModelConfig` with defaults filled in for the optional parts.
 *
 * A relative path is resolved against `process.cwd()`. The optional sections
 * (`capabilities`, `auth`, `overrides`, `supported_endpoints`) may be left out entirely.
 *
 * @param modelConfigFilePath Absolute or cwd-relative path of the JSON file.
 * @returns One `IModelConfig` per top-level key of the file.
 * @throws Error when the file is not valid JSON, its root is not an object, or an entry
 * fails validation. Errors from reading the file are propagated unchanged.
 */
function parseModelConfigFile(modelConfigFilePath: string): IModelConfig[] {
	const resolvedModelConfigFilePath = path.isAbsolute(modelConfigFilePath) ? modelConfigFilePath : path.join(process.cwd(), modelConfigFilePath);
	const configFileContents = fs.readFileSync(resolvedModelConfigFilePath, 'utf-8');

	let modelConfig: any;
	try {
		modelConfig = JSON.parse(configFileContents);
	} catch (error) {
		// The caught value is not guaranteed to be an Error instance.
		const reason = error instanceof Error ? error.message : String(error);
		throw new Error(`Invalid JSON configuration file ${resolvedModelConfigFilePath}: ${reason}`);
	}

	if (!modelConfig || typeof modelConfig !== 'object') {
		throw new Error('Invalid configuration file ' + resolvedModelConfigFilePath);
	}

	/**
	 * the modelConfigFile.json should contain objects of the form:
	```
		"<model id>": {
			"name": "<model name>",
			"version": "<model version>",
			"type": "<model type>", // 'openai' or 'azureOpenai'
			"useDeveloperRole": <boolean>, // optional, defaults to false
			"url": "<endpoint URL>",
			"capabilities"?: {
				"supports"?: {
					"parallel_tool_calls"?: <boolean>,
					"streaming"?: <boolean>,
					"tool_calls"?: <boolean>,
					"vision"?: <boolean>,
					"prediction"?: <boolean>,
					"thinking"?: <boolean>
				},
				"limits"?: {
					"max_prompt_tokens"?: <number>,
					"max_output_tokens"?: <number>,
					"max_context_window_tokens"?: <number>
				}
			},
			"supported_endpoints"?: [ "<ModelSupportedEndpoint value>", ... ], // optional, defaults to [ModelSupportedEndpoint.ChatCompletions]
			"auth"?: {
				"useBearerHeader"?: <boolean>, // Use Bearer token for authentication. Defaults to false
				"useApiKeyHeader"?: <boolean>, // Use API key for authentication. Defaults to false
				"apiKeyEnvName": "<environment variable name for API key to be used for the above headers>"
			},
			"overrides"?: {
				"requestHeaders"?: { "<header name>": "<header value>" }, // optional, custom request headers
				"temperature"?: <number> | null, // optional, if null removes from request body
				"top_p"?: <number> | null, // optional, if null removes from request body
				"snippy"?: <boolean> | null, // optional, if null removes from request body
				"max_tokens"?: <number> | null, // optional, if null removes from request body
				"max_completion_tokens"?: <number> | null, // optional, if null removes from request body
				"intent"?: <boolean> | null // optional, if null removes from request body
			}
		},
		...
	```
	*/

	// Validates a single property of `obj`. `optional` allows the property to be absent,
	// `nullable` additionally accepts an explicit `null`.
	const checkProperty = (obj: any, prop: string, type: 'string' | 'boolean' | 'number' | 'object', optional?: boolean, nullable?: boolean) => {
		if (!(prop in obj)) {
			if (optional) {
				return;
			}
			throw new Error(`Missing property '${prop}' in model configuration file ${resolvedModelConfigFilePath}`);
		}

		if (nullable && obj[prop] === null) {
			return;
		}

		if (typeof obj[prop] !== type) {
			throw new Error(`Property '${prop}' in model configuration file ${resolvedModelConfigFilePath} must be of type '${type}', but got '${typeof obj[prop]}'`);
		}
	};

	const modelConfigs: IModelConfig[] = [];
	for (const [modelId, model] of Object.entries<any>(modelConfig)) {
		// `typeof null` is 'object', so null has to be rejected explicitly.
		if (typeof model !== 'object' || model === null) {
			throw new Error(`Model configuration for '${modelId}' must be an object`);
		}
		checkProperty(model, 'name', 'string');
		checkProperty(model, 'version', 'string');
		checkProperty(model, 'type', 'string');
		if (model.type !== 'openai' && model.type !== 'azureOpenai') {
			throw new Error(`Model type '${model.type}' is not supported. Only 'openai' and 'azureOpenai' are allowed.`);
		}
		checkProperty(model, 'useDeveloperRole', 'boolean', true);
		checkProperty(model, 'url', 'string');

		checkProperty(model, 'capabilities', 'object', true);
		// `capabilities` is optional: only look inside it when it is present, because the
		// `in` operator used by checkProperty throws on undefined/null.
		const capabilities = model.capabilities;
		if (capabilities) {
			checkProperty(capabilities, 'supports', 'object', true);
			if (capabilities.supports) {
				checkProperty(capabilities.supports, 'parallel_tool_calls', 'boolean', true);
				checkProperty(capabilities.supports, 'streaming', 'boolean', true);
				checkProperty(capabilities.supports, 'tool_calls', 'boolean', true);
				checkProperty(capabilities.supports, 'vision', 'boolean', true);
				checkProperty(capabilities.supports, 'prediction', 'boolean', true);
				checkProperty(capabilities.supports, 'thinking', 'boolean', true);
			}

			checkProperty(capabilities, 'limits', 'object', true);
			if (capabilities.limits) {
				checkProperty(capabilities.limits, 'max_prompt_tokens', 'number', true);
				checkProperty(capabilities.limits, 'max_output_tokens', 'number', true);
				checkProperty(capabilities.limits, 'max_context_window_tokens', 'number', true);
			}
		}

		checkProperty(model, 'auth', 'object', true);
		if (model.auth) {
			checkProperty(model.auth, 'useBearerHeader', 'boolean', true);
			checkProperty(model.auth, 'useApiKeyHeader', 'boolean', true);
			checkProperty(model.auth, 'apiKeyEnvName', 'string');
		}

		checkProperty(model, 'overrides', 'object', true);
		if (model.overrides) {
			const overrides = model.overrides;
			checkProperty(overrides, 'requestHeaders', 'object', true, true);
			checkProperty(overrides, 'temperature', 'number', true, true);
			checkProperty(overrides, 'top_p', 'number', true, true);
			checkProperty(overrides, 'snippy', 'boolean', true, true);
			checkProperty(overrides, 'intent', 'boolean', true, true);
			checkProperty(overrides, 'max_tokens', 'number', true, true);
			checkProperty(overrides, 'max_completion_tokens', 'number', true, true);
		}

		// Validate supported_endpoints
		if (model.supported_endpoints) {
			if (!Array.isArray(model.supported_endpoints)) {
				throw new Error(`Property 'supported_endpoints' in model configuration file ${resolvedModelConfigFilePath} must be an array`);
			}
			for (const endpointSuffix of model.supported_endpoints) {
				if (!Object.values(ModelSupportedEndpoint).includes(endpointSuffix as ModelSupportedEndpoint)) {
					throw new Error(`Invalid endpoint suffix '${endpointSuffix}' in supported_endpoints for model '${modelId}'. Must be one of: ${Object.values(ModelSupportedEndpoint).join(', ')}`);
				}
			}
		}

		// An explicit `null` override is kept as `null`; only a missing key falls back to the default.
		const overrideValues = model.overrides ?? {};

		modelConfigs.push({
			id: modelId,
			name: model.name,
			version: model.version,
			type: model.type,
			useDeveloperRole: model.useDeveloperRole ?? false,
			url: model.url,
			capabilities: {
				supports: {
					parallel_tool_calls: capabilities?.supports?.parallel_tool_calls ?? false,
					streaming: capabilities?.supports?.streaming ?? false,
					tool_calls: capabilities?.supports?.tool_calls ?? false,
					vision: capabilities?.supports?.vision ?? false,
					prediction: capabilities?.supports?.prediction ?? false,
					thinking: capabilities?.supports?.thinking ?? false
				},
				limits: {
					max_prompt_tokens: capabilities?.limits?.max_prompt_tokens ?? 128000,
					max_output_tokens: capabilities?.limits?.max_output_tokens ?? Number.MAX_SAFE_INTEGER,
					max_context_window_tokens: capabilities?.limits?.max_context_window_tokens
				}
			},
			supported_endpoints: model.supported_endpoints?.length ? model.supported_endpoints as ModelSupportedEndpoint[] : [ModelSupportedEndpoint.ChatCompletions],
			auth: {
				useBearerHeader: model.auth?.useBearerHeader ?? false,
				useApiKeyHeader: model.auth?.useApiKeyHeader ?? false,
				apiKeyEnvName: model.auth?.apiKeyEnvName
			},
			overrides: {
				requestHeaders: Object.prototype.hasOwnProperty.call(overrideValues, 'requestHeaders') ? overrideValues.requestHeaders : {},
				temperature: overrideValues.temperature,
				top_p: overrideValues.top_p,
				snippy: overrideValues.snippy,
				intent: overrideValues.intent,
				max_tokens: overrideValues.max_tokens,
				max_completion_tokens: overrideValues.max_completion_tokens,
			}
		});
	}

	return modelConfigs;
}
978

979
// Script entry point: invoke `main` from inside an async function so that a synchronous
// throw surfaces as a promise rejection as well.
(async function runMain() {
	return main();
})();
980

981
Product

Resources

Company