Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/scripts/chat-simulation/test-chat-perf-regression.js
13379 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
// @ts-check
7
8
/**
9
* Chat performance benchmark.
10
*
11
* Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local
12
* mock LLM server. Measures the full stack: prompt building, context
13
* gathering, tool resolution, rendering, GC, and layout overhead.
14
*
15
* Usage:
16
* npm run perf:chat # all scenarios vs 1.115.0
17
* npm run perf:chat -- --runs 10 # 10 runs per scenario
18
* npm run perf:chat -- --scenario text-only # single scenario
19
* npm run perf:chat -- --no-baseline # skip baseline comparison
20
* npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0
21
* npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3
22
*/
23
24
const path = require('path');
25
const fs = require('fs');
26
const {
27
ROOT, DATA_DIR, METRIC_DEFS, loadConfig,
28
resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir,
29
robustStats, welchTTest, summarize, markDuration, launchVSCode,
30
getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot,
31
} = require('./common/utils');
32
const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server');
33
const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios');
34
35
// -- Config (edit config.jsonc to change defaults) ---------------------------
36
37
// Defaults for runs-per-scenario, baseline build, and thresholds come from
// the "perfRegression" section of config.jsonc.
const CONFIG = loadConfig('perfRegression');
38
39
// -- CLI args ----------------------------------------------------------------
40
41
/**
 * Parse process.argv into benchmark options.
 *
 * Defaults come from config.jsonc (the "perfRegression" section) via CONFIG.
 * Exits the process for --help, for malformed key=value settings, for
 * non-numeric --runs/--threshold values, and for unknown scenario ids.
 *
 * @returns {{
 *  runs: number, verbose: boolean, ci: boolean, noCache: boolean,
 *  force: boolean, heapSnapshots: boolean, scenarios: string[],
 *  build: string | undefined, baseline: string | undefined,
 *  baselineBuild: string | undefined, saveBaseline: boolean,
 *  threshold: number, metricThresholds: Record<string, number | string>,
 *  resume: string | undefined, productionBuild: boolean,
 *  settingsOverrides: Record<string, any>,
 *  testSettingsOverrides: Record<string, any>,
 *  baselineSettingsOverrides: Record<string, any>,
 *  cleanupDiagnostics: boolean,
 * }}
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		runs: CONFIG.runsPerScenario ?? 5,
		verbose: false,
		ci: false,
		noCache: false,
		force: false,
		heapSnapshots: false,
		/** @type {string[]} */
		scenarios: [],
		/** @type {string | undefined} */
		build: undefined,
		/** @type {string | undefined} */
		baseline: undefined,
		/** @type {string | undefined} */
		baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
		saveBaseline: false,
		threshold: CONFIG.regressionThreshold ?? 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: CONFIG.metricThresholds ?? {},
		/** @type {string | undefined} */
		resume: undefined,
		productionBuild: false,
		/** @type {Record<string, any>} */
		settingsOverrides: {},
		/** @type {Record<string, any>} */
		testSettingsOverrides: {},
		/** @type {Record<string, any>} */
		baselineSettingsOverrides: {},
		cleanupDiagnostics: false,
	};
	for (let i = 0; i < args.length; i++) {
		switch (args[i]) {
			case '--runs': {
				// Fail fast instead of silently storing NaN (e.g. `--runs` as
				// the last token, or `--runs abc`), which would make every
				// per-scenario run loop execute zero times.
				opts.runs = Number.parseInt(args[++i], 10);
				if (!Number.isInteger(opts.runs) || opts.runs <= 0) {
					console.error(`--runs requires a positive integer, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			}
			case '--verbose': opts.verbose = true; break;
			case '--scenario': case '-s': opts.scenarios.push(args[++i]); break;
			case '--build': case '-b': opts.build = args[++i]; break;
			case '--baseline': opts.baseline = args[++i]; break;
			case '--baseline-build': opts.baselineBuild = args[++i]; break;
			case '--no-baseline': opts.baselineBuild = undefined; break;
			case '--save-baseline': opts.saveBaseline = true; break;
			case '--threshold': {
				// Same fail-fast treatment as --runs: a NaN threshold would
				// make every regression comparison silently pass.
				opts.threshold = Number.parseFloat(args[++i]);
				if (Number.isNaN(opts.threshold)) {
					console.error(`--threshold requires a number, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			}
			case '--resume': opts.resume = args[++i]; break;
			case '--production-build': opts.productionBuild = true; break;
			case '--setting': case '--test-setting': case '--baseline-setting': {
				const kv = args[++i];
				const eq = kv.indexOf('=');
				if (eq === -1) { console.error(`${args[i - 1]} requires key=value, got: ${kv}`); process.exit(1); }
				const key = kv.slice(0, eq);
				const raw = kv.slice(eq + 1);
				// Parse booleans and numbers, keep rest as strings
				const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw;
				const flag = args[i - 1];
				if (flag === '--test-setting') { opts.testSettingsOverrides[key] = val; }
				else if (flag === '--baseline-setting') { opts.baselineSettingsOverrides[key] = val; }
				else { opts.settingsOverrides[key] = val; }
				break;
			}
			case '--no-cache': opts.noCache = true; break;
			case '--force': opts.force = true; break;
			case '--heap-snapshots': opts.heapSnapshots = true; break;
			case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break;
			case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break;
			case '--help': case '-h':
				console.log([
					'Chat performance benchmark',
					'',
					'Options:',
					'  --runs <n>            Number of runs per scenario (default: 5)',
					'  --scenario <id>       Scenario to run (repeatable; default: all)',
					'  --build <path|ver>    Path to VS Code build, or a version to download',
					'                        (e.g. "1.110.0", "insiders", commit hash, or local path)',
					'  --baseline <path>     Compare against a baseline JSON file',
					'  --baseline-build <v>  Version or path to benchmark as baseline',
					'                        (e.g. "1.115.0", "insiders", commit hash, or local path)',
					'  --no-baseline         Skip baseline comparison entirely',
					'  --save-baseline       Save results as the new baseline (requires --baseline <path>)',
					'  --resume <path>       Resume a previous run, adding more iterations to increase',
					'                        confidence. Merges new runs with existing rawRuns data',
					'  --threshold <frac>    Regression threshold fraction (default: 0.2 = 20%)',
					'  --production-build    Build a local bundled package (via gulp vscode) for',
					'                        apples-to-apples comparison against a release baseline',
					'  --setting <k=v>       Set a VS Code setting override for all builds (repeatable)',
					'  --test-setting <k=v>  Set a VS Code setting override for test build only',
					'  --baseline-setting <k=v> Set a VS Code setting override for baseline build only',
					'                        e.g. --setting chat.experimental.incrementalRendering.enabled=true',
					'  --no-cache            Ignore cached baseline data, always run fresh',
					'  --force               Skip build mode mismatch confirmation',
					'  --heap-snapshots      Take heap snapshots (slow; auto-enabled in --ci mode)',
					'  --ci                  CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)',
					'  --cleanup-diagnostics Remove heap snapshots, CPU profiles, and traces after each run to save disk space',
					'  --verbose             Print per-run details',
					'',
					'Scenarios: ' + getScenarioIds().join(', '),
				].join('\n'));
				process.exit(0);
		}
	}
	// No explicit --scenario means "run everything"; otherwise reject
	// typos before spending minutes on a benchmark run.
	if (opts.scenarios.length === 0) {
		opts.scenarios = getScenarioIds();
	} else {
		const knownIds = new Set(getScenarioIds());
		const unknown = opts.scenarios.filter(s => !knownIds.has(s));
		if (unknown.length > 0) {
			console.error(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`);
			process.exit(1);
		}
	}
	return opts;
}
152
153
// -- Build mode detection ----------------------------------------------------
154
155
/**
156
* Classify an electron path into a build mode.
157
* @param {string} electronPath
158
* @returns {'dev' | 'production' | 'release'}
159
*/
160
/**
 * Classify an electron path into a build mode.
 *
 * Downloaded release builds are unpacked under a `.vscode-test` directory;
 * locally packaged production builds live in a `VSCode-<platform>-<arch>`
 * directory. Anything else is treated as a development build. The
 * `.vscode-test` check deliberately runs first.
 *
 * @param {string} electronPath
 * @returns {'dev' | 'production' | 'release'}
 */
function detectBuildMode(electronPath) {
	if (electronPath.includes('.vscode-test')) {
		return 'release';
	}
	return electronPath.includes('VSCode-') ? 'production' : 'dev';
}
169
170
/**
171
* Return a human-readable label for a build mode.
172
* @param {'dev' | 'production' | 'release'} mode
173
* @returns {string}
174
*/
175
/**
 * Return a human-readable label for a build mode.
 * @param {'dev' | 'production' | 'release'} mode
 * @returns {string}
 */
function buildModeLabel(mode) {
	const labels = {
		dev: 'development (unbundled)',
		production: 'production (bundled, local)',
		release: 'release (bundled, downloaded)',
	};
	return labels[mode];
}
182
183
// -- Production build --------------------------------------------------------
184
185
/**
186
* Build a local production (bundled) VS Code package using `gulp vscode`.
187
* Returns the path to the Electron executable in the packaged output.
188
*
189
* The gulp task compiles TypeScript, bundles JS, and packages with Electron
190
* into `../VSCode-<platform>-<arch>/`. This is the same process used for
191
* release builds, minus minification and mangling.
192
*/
193
/**
 * Build a local production (bundled) VS Code package using `gulp vscode`.
 * Returns the path to the Electron executable in the packaged output.
 *
 * The gulp task compiles TypeScript, bundles JS, and packages with Electron
 * into `../VSCode-<platform>-<arch>/`. This is the same process used for
 * release builds, minus minification and mangling.
 *
 * @returns {string} absolute path to the packaged Electron executable;
 *   exits the process if the build did not produce one
 */
function buildProductionBuild() {
	const product = require(path.join(ROOT, 'product.json'));
	const platform = process.platform;
	const arch = process.arch;
	const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`);

	console.log('[chat-simulation] Building local production package (gulp vscode)...');
	console.log('[chat-simulation] This may take a few minutes on the first run.');

	const { execSync } = require('child_process');
	try {
		execSync('npm run gulp -- vscode', {
			cwd: ROOT,
			stdio: 'inherit',
			timeout: 10 * 60 * 1000, // 10 minute timeout
		});
	} catch (e) {
		// The copilot shim step may fail locally when the copilot SDK is not
		// fully packaged (it is normally supplied via CI). As long as the
		// Electron executable was produced we can still benchmark.
		console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...');
	}

	const electronPath = packagedElectronPath(destDir, product, platform);
	if (!fs.existsSync(electronPath)) {
		console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`);
		process.exit(1);
	}

	mergePackagedProductOverrides(destDir, product, platform);

	console.log(`[chat-simulation] Production build ready: ${electronPath}`);
	return electronPath;
}

/**
 * Locate the Electron executable inside a packaged VS Code output directory.
 * @param {string} destDir - the `VSCode-<platform>-<arch>` directory
 * @param {any} product - parsed product.json (names the executable)
 * @param {NodeJS.Platform} platform
 * @returns {string}
 */
function packagedElectronPath(destDir, product, platform) {
	if (platform === 'darwin') {
		return path.join(destDir, `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort);
	}
	if (platform === 'linux') {
		return path.join(destDir, product.applicationName);
	}
	// Windows and any other platform use an .exe next to the resources
	return path.join(destDir, `${product.nameShort}.exe`);
}

/**
 * Merge product.overrides.json into the packaged product.json.
 *
 * The overrides file contains extensionsGallery and other config that
 * the OSS product.json lacks. In dev builds these are loaded at
 * runtime when VSCODE_DEV is set, but the production build doesn't
 * set that flag so we bake them in. No-op when either file is absent.
 *
 * @param {string} destDir - the `VSCode-<platform>-<arch>` directory
 * @param {any} product - parsed product.json (names the .app bundle on macOS)
 * @param {NodeJS.Platform} platform
 */
function mergePackagedProductOverrides(destDir, product, platform) {
	const overridesPath = path.join(ROOT, 'product.overrides.json');
	if (!fs.existsSync(overridesPath)) {
		return;
	}
	const appDir = platform === 'darwin'
		? path.join(destDir, `${product.nameLong}.app`, 'Contents', 'Resources', 'app')
		: path.join(destDir, 'resources', 'app');
	const packagedProductPath = path.join(appDir, 'product.json');
	if (!fs.existsSync(packagedProductPath)) {
		return;
	}
	const packagedProduct = JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8'));
	const overrides = JSON.parse(fs.readFileSync(overridesPath, 'utf-8'));
	const merged = Object.assign(packagedProduct, overrides);
	fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t'));
	console.log('[chat-simulation] Merged product.overrides.json into packaged product.json');
}
258
259
/**
260
* @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
261
*/
262
263
/**
264
* Parse a metric threshold value from config.
265
* - A number is treated as a fraction (e.g. 0.2 = 20%).
266
* - A string like "100ms" or "5" is treated as an absolute delta.
267
* @param {number | string} raw
268
* @returns {MetricThreshold}
269
*/
270
/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta.
 * @param {number | string} raw
 * @returns {MetricThreshold}
 * @throws {Error} when a string threshold has no leading number
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		return { type: 'fraction', value: raw };
	}
	// parseFloat reads the leading number and ignores a trailing
	// unit suffix such as "ms" or "MB".
	const value = Number.parseFloat(raw);
	if (Number.isNaN(value)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value };
}
281
282
/**
283
* Get the regression threshold for a specific metric.
284
* Uses per-metric override from config if available, otherwise the global threshold.
285
* @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
286
* @param {string} metric
287
* @returns {MetricThreshold}
288
*/
289
/**
 * Get the regression threshold for a specific metric.
 * Uses per-metric override from config if available, otherwise the global threshold.
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
 * @param {string} metric
 * @returns {MetricThreshold}
 */
function getMetricThreshold(opts, metric) {
	const override = opts.metricThresholds?.[metric];
	return override === undefined
		? { type: 'fraction', value: opts.threshold }
		: parseMetricThreshold(override);
}
296
297
/**
298
* Check whether a change exceeds the threshold.
299
* @param {MetricThreshold} threshold
300
* @param {number} change - fractional change (e.g. 0.5 = 50% increase)
301
* @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
302
* @returns {boolean}
303
*/
304
/**
 * Check whether a change exceeds the threshold.
 * @param {MetricThreshold} threshold
 * @param {number} change - fractional change (e.g. 0.5 = 50% increase)
 * @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
 * @returns {boolean}
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	// Absolute thresholds compare the raw delta; fractional ones the relative change.
	const observed = threshold.type === 'absolute' ? absoluteDelta : change;
	return observed > threshold.value;
}
310
311
// -- Metrics -----------------------------------------------------------------
312
313
/**
314
* @typedef {{
315
* timeToUIUpdated: number,
316
* timeToFirstToken: number,
317
* timeToComplete: number,
318
* timeToRenderComplete: number,
319
* instructionCollectionTime: number,
320
* agentInvokeTime: number,
321
* heapUsedBefore: number,
322
* heapUsedAfter: number,
323
* heapDelta: number,
324
* heapDeltaPostGC: number,
325
* majorGCs: number,
326
* minorGCs: number,
327
* gcDurationMs: number,
328
* layoutCount: number,
329
* layoutDurationMs: number,
330
* recalcStyleCount: number,
331
* forcedReflowCount: number,
332
* longTaskCount: number,
333
* longAnimationFrameCount: number,
334
* longAnimationFrameTotalMs: number,
335
* frameCount: number,
336
* compositeLayers: number,
337
* paintCount: number,
338
* hasInternalMarks: boolean,
339
* responseHasContent: boolean,
340
* internalFirstToken: number,
341
* profilePath: string,
342
* tracePath: string,
343
* snapshotPath: string,
344
* extHostHeapUsedBefore: number,
345
* extHostHeapUsedAfter: number,
346
* extHostHeapDelta: number,
347
* extHostHeapDeltaPostGC: number,
348
* extHostProfilePath: string,
349
* extHostSnapshotPath: string,
350
* }} RunMetrics
351
*/
352
353
// -- Single run --------------------------------------------------------------
354
355
/**
356
* @param {string} electronPath
357
* @param {string} scenario
358
* @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise<void>, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise<void> }} mockServer
359
* @param {boolean} verbose
360
* @param {string} runIndex
361
* @param {string} runDir - timestamped run directory for diagnostics
362
* @param {'baseline' | 'test'} role - whether this is a baseline or test run
363
* @param {Record<string, any>} [settingsOverrides] - custom VS Code settings
364
* @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options
365
* @returns {Promise<RunMetrics>}
366
*/
367
async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) {
368
const takeHeapSnapshots = runOpts?.heapSnapshots ?? false;
369
const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides);
370
const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-');
371
// Extract a clean build label from the path.
372
// Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev"
373
// Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0"
374
// Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production"
375
let buildLabel = 'dev';
376
if (!isDevBuild) {
377
const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/);
378
if (vscodeTestMatch) {
379
buildLabel = vscodeTestMatch[1];
380
} else if (electronPath.includes('VSCode-')) {
381
buildLabel = 'production';
382
} else {
383
buildLabel = path.basename(electronPath);
384
}
385
}
386
387
// For dev builds from a different repo, derive the repo root from the
388
// electron path so that the build loads its own out/ source code.
389
const appRoot = isDevBuild ? (getRepoRoot(electronPath) || ROOT) : ROOT;
390
if (isDevBuild && appRoot !== ROOT) {
391
if (verbose) {
392
console.log(` [debug] Using appRoot from electron path: ${appRoot}`);
393
}
394
}
395
396
// Create a per-run diagnostics directory: <runDir>/<role>-<build>/<scenario>-<i>/
397
const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, ''));
398
fs.mkdirSync(runDiagDir, { recursive: true });
399
400
const tracePath = path.join(runDiagDir, 'trace.json');
401
const extHostInspectPort = getNextExtHostInspectPort();
402
const vscode = await launchVSCode(
403
electronPath,
404
buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }),
405
buildEnv(mockServer, { isDevBuild }),
406
{ verbose },
407
);
408
activeVSCode = vscode;
409
const window = vscode.page;
410
411
// Declared outside try so the finally block can clean up
412
/** @type {{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */
413
let extHostInspector = null;
414
/** @type {{ usedSize: number, totalSize: number } | null} */
415
let extHostHeapBefore = null;
416
/** @type {Omit<RunMetrics, 'majorGCs' | 'minorGCs' | 'gcDurationMs' | 'longTaskCount' | 'longAnimationFrameCount' | 'longAnimationFrameTotalMs' | 'timeToUIUpdated' | 'timeToFirstToken' | 'timeToComplete' | 'timeToRenderComplete' | 'layoutDurationMs' | 'instructionCollectionTime' | 'agentInvokeTime' | 'hasInternalMarks' | 'internalFirstToken'> | null} */
417
let partialMetrics = null;
418
// Timing vars hoisted for access in post-close trace parsing
419
let submitTime = 0;
420
let firstResponseTime = 0;
421
let responseCompleteTime = 0;
422
let renderCompleteTime = 0;
423
424
try {
425
await window.waitForSelector('.monaco-workbench', { timeout: 60_000 });
426
427
const cdp = await window.context().newCDPSession(window);
428
await cdp.send('Performance.enable');
429
const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
430
431
const metricsBefore = await cdp.send('Performance.getMetrics');
432
433
// Open chat
434
const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI';
435
await window.keyboard.press(chatShortcut);
436
437
const CHAT_VIEW = 'div[id="workbench.panel.chat"]';
438
const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`;
439
440
await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 });
441
await window.waitForFunction(
442
(selector) => Array.from(document.querySelectorAll(selector)).some(el => {
443
const rect = el.getBoundingClientRect();
444
return rect.width > 0 && rect.height > 0;
445
}),
446
chatEditorSel, { timeout: 15_000 },
447
);
448
449
// Dismiss dialogs
450
const dismissDialog = async () => {
451
for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) {
452
const el = await window.$(sel);
453
if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; }
454
}
455
};
456
await dismissDialog();
457
458
// Wait for extension activation
459
const reqsBefore = mockServer.requestCount();
460
try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { }
461
if (verbose) {
462
console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`);
463
}
464
465
// Connect to extension host inspector for profiling/heap data
466
try {
467
extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 });
468
await extHostInspector.send('HeapProfiler.enable');
469
await extHostInspector.send('Profiler.enable');
470
await extHostInspector.send('Profiler.start');
471
extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage');
472
if (verbose && extHostHeapBefore) {
473
console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`);
474
}
475
} catch (err) {
476
if (verbose) {
477
console.log(` [ext-host] Could not connect to inspector: ${err}`);
478
}
479
}
480
481
// Wait for model resolution
482
await new Promise(r => setTimeout(r, 3000));
483
await dismissDialog();
484
485
// Focus input
486
await window.click(chatEditorSel);
487
const focusStart = Date.now();
488
while (Date.now() - focusStart < 5_000) {
489
const focused = await window.evaluate((sel) => {
490
const el = document.querySelector(sel);
491
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
492
}, chatEditorSel).catch(() => false);
493
if (focused) { break; }
494
await new Promise(r => setTimeout(r, 50));
495
}
496
497
// Type message — use the smoke-test driver's typeInEditor when available
498
// (dev builds), fall back to pressSequentially for stable/insiders builds.
499
const chatMessage = `[scenario:${scenario}] Explain how this code works`;
500
const actualInputSelector = await window.evaluate((editorSel) => {
501
const editor = document.querySelector(editorSel);
502
if (!editor) { throw new Error('Chat editor not found'); }
503
return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea';
504
}, chatEditorSel);
505
506
const hasDriver = await window.evaluate(() =>
507
// @ts-ignore
508
!!globalThis.driver?.typeInEditor
509
).catch(() => false);
510
511
if (hasDriver) {
512
await window.evaluate(({ selector, text }) => {
513
// @ts-ignore
514
return globalThis.driver.typeInEditor(selector, text);
515
}, { selector: actualInputSelector, text: chatMessage });
516
} else {
517
// Fallback: click the input element and use pressSequentially
518
await window.click(actualInputSelector);
519
await new Promise(r => setTimeout(r, 200));
520
await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 });
521
}
522
523
// Start CPU profiler to capture call stacks during the interaction
524
await cdp.send('Profiler.enable');
525
await cdp.send('Profiler.start');
526
527
// Submit
528
const completionsBefore = mockServer.completionCount();
529
submitTime = Date.now();
530
await window.keyboard.press('Enter');
531
532
// Wait for mock server to serve the response
533
try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { }
534
firstResponseTime = Date.now();
535
536
// Wait for DOM response to settle
537
await dismissDialog();
538
const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`;
539
await window.waitForFunction(
540
(sel) => {
541
const responses = document.querySelectorAll(sel);
542
if (responses.length === 0) { return false; }
543
return !responses[responses.length - 1].classList.contains('chat-response-loading');
544
},
545
responseSelector, { timeout: 30_000 },
546
);
547
responseCompleteTime = Date.now();
548
549
// -- User turn injection loop -----------------------------------------
550
// For multi-turn scenarios with user follow-ups, type each follow-up
551
// message and wait for the model's response to settle.
552
const userTurns = getUserTurns(scenario);
553
for (let ut = 0; ut < userTurns.length; ut++) {
554
const userTurn = userTurns[ut];
555
if (verbose) {
556
console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`);
557
}
558
559
// Brief pause to let the UI settle between turns
560
await new Promise(r => setTimeout(r, 500));
561
562
// Focus the chat input
563
await window.click(chatEditorSel);
564
const utFocusStart = Date.now();
565
while (Date.now() - utFocusStart < 3_000) {
566
const focused = await window.evaluate((sel) => {
567
const el = document.querySelector(sel);
568
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
569
}, chatEditorSel).catch(() => false);
570
if (focused) { break; }
571
await new Promise(r => setTimeout(r, 50));
572
}
573
574
// Type the follow-up message
575
if (hasDriver) {
576
await window.evaluate(({ selector, text }) => {
577
// @ts-ignore
578
return globalThis.driver.typeInEditor(selector, text);
579
}, { selector: actualInputSelector, text: userTurn.message });
580
} else {
581
await window.click(actualInputSelector);
582
await new Promise(r => setTimeout(r, 200));
583
await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 });
584
}
585
586
// Submit follow-up
587
const utCompBefore = mockServer.completionCount();
588
await window.keyboard.press('Enter');
589
590
// Wait for mock server to serve the response for this turn
591
try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { }
592
593
// Wait for the new response to finish rendering.
594
// The chat list is virtualized — old response elements are
595
// recycled out of the DOM as new ones appear, so we cannot
596
// rely on counting DOM elements. Instead, scroll to the
597
// bottom and wait for no response to be in loading state.
598
await dismissDialog();
599
await window.evaluate((chatViewSel) => {
600
const input = document.querySelector(chatViewSel + ' .interactive-input-part');
601
if (input) { input.scrollIntoView({ block: 'end' }); }
602
}, CHAT_VIEW);
603
await new Promise(r => setTimeout(r, 200));
604
605
await window.waitForFunction(
606
(sel) => {
607
const responses = document.querySelectorAll(sel);
608
if (responses.length === 0) { return false; }
609
return !responses[responses.length - 1].classList.contains('chat-response-loading');
610
},
611
responseSelector,
612
{ timeout: 30_000 },
613
);
614
responseCompleteTime = Date.now();
615
616
if (verbose) {
617
const utResponseInfo = await window.evaluate((sel) => {
618
const responses = document.querySelectorAll(sel);
619
const last = responses[responses.length - 1];
620
return last ? (last.textContent || '').substring(0, 150) : '(empty)';
621
}, responseSelector);
622
console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`);
623
}
624
}
625
626
// Stop CPU profiler and save the profile
627
const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop'));
628
const profilePath = path.join(runDiagDir, 'profile.cpuprofile');
629
fs.writeFileSync(profilePath, JSON.stringify(profile));
630
if (verbose) {
631
console.log(` [debug] CPU profile saved to ${profilePath}`);
632
}
633
634
const responseInfo = await window.evaluate((sel) => {
635
const responses = document.querySelectorAll(sel);
636
const last = responses[responses.length - 1];
637
if (!last) { return { hasContent: false, text: '' }; }
638
const text = last.textContent || '';
639
return { hasContent: text.trim().length > 0, text: text.substring(0, 200) };
640
}, responseSelector);
641
642
if (verbose) {
643
console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`);
644
console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`);
645
}
646
647
// Wait for the typewriter animation to finish rendering.
648
// The chat UI animates streamed content word-by-word after the
649
// response stream completes. We need to wait until all content
650
// is rendered before capturing layout/style metrics, otherwise
651
// we miss the rendering phase where batching optimizations matter.
652
await window.waitForFunction(
653
(sel) => {
654
const responses = document.querySelectorAll(sel);
655
const last = responses[responses.length - 1];
656
if (!last) { return true; }
657
// The typewriter animation is done when there are no
658
// elements with the 'typewriter' or 'animating' class,
659
// and no pending cursor animations.
660
const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor');
661
return !hasAnimating;
662
},
663
responseSelector,
664
{ timeout: 30_000 },
665
).catch(() => {
666
// Fallback: if the selector-based check doesn't work (e.g.
667
// the CSS classes differ across versions), wait for content
668
// to stabilize by polling textContent.
669
});
670
671
// Additional stabilization: poll until textContent stops changing.
672
// This catches any remaining animation regardless of CSS class names.
673
{
674
let prev = '';
675
let stableCount = 0;
676
const stabilizeStart = Date.now();
677
while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) {
678
const current = await window.evaluate((sel) => {
679
const responses = document.querySelectorAll(sel);
680
const last = responses[responses.length - 1];
681
return last ? (last.textContent || '') : '';
682
}, responseSelector).catch(() => '');
683
if (current === prev) {
684
stableCount++;
685
} else {
686
stableCount = 0;
687
prev = current;
688
}
689
await new Promise(r => setTimeout(r, 100));
690
}
691
}
692
renderCompleteTime = Date.now();
693
if (verbose) {
694
console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`);
695
}
696
697
const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
698
const metricsAfter = await cdp.send('Performance.getMetrics');
699
700
// -- Extension host metrics (non-snapshot) ---------------------------
701
let extHostHeapUsedBefore = -1;
702
let extHostHeapUsedAfter = -1;
703
let extHostHeapDelta = -1;
704
let extHostHeapDeltaPostGC = -1;
705
let extHostProfilePath = '';
706
let extHostSnapshotPath = '';
707
if (extHostInspector && extHostHeapBefore) {
708
try {
709
extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024);
710
711
// Stop CPU profiler and save
712
const extProfile = await extHostInspector.send('Profiler.stop');
713
extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile');
714
fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile));
715
if (verbose) {
716
console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`);
717
}
718
719
// Heap usage after interaction
720
const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage');
721
extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024);
722
extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore;
723
724
// Force GC and measure retained heap
725
try {
726
await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
727
await new Promise(r => setTimeout(r, 200));
728
const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage');
729
extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore;
730
} catch {
731
extHostHeapDeltaPostGC = -1;
732
}
733
734
if (verbose) {
735
console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`);
736
}
737
} catch (err) {
738
if (verbose) {
739
console.log(` [ext-host] Error collecting metrics: ${err}`);
740
}
741
}
742
}
743
744
// -- Heap snapshots (opt-in, parallelized) ---------------------------
745
let snapshotPath = '';
746
if (takeHeapSnapshots) {
747
const snapshotPromises = [];
748
749
// Renderer snapshot
750
snapshotPromises.push((async () => {
751
const p = path.join(runDiagDir, 'heap.heapsnapshot');
752
await cdp.send('HeapProfiler.enable');
753
const chunks = /** @type {string[]} */ ([]);
754
cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
755
chunks.push(params.chunk);
756
});
757
await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
758
fs.writeFileSync(p, chunks.join(''));
759
return p;
760
})());
761
762
// Extension host snapshot (parallel with renderer)
763
if (extHostInspector && extHostHeapBefore) {
764
snapshotPromises.push((async () => {
765
const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot');
766
const chunks = /** @type {string[]} */ ([]);
767
extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
768
chunks.push(params.chunk);
769
});
770
await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
771
fs.writeFileSync(p, chunks.join(''));
772
return p;
773
})());
774
}
775
776
const snapshotResults = await Promise.all(snapshotPromises);
777
snapshotPath = snapshotResults[0];
778
if (snapshotResults.length > 1) {
779
extHostSnapshotPath = snapshotResults[1];
780
}
781
782
if (verbose) {
783
console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`);
784
if (extHostSnapshotPath) {
785
console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`);
786
}
787
}
788
}
789
790
// Close ext host inspector now that snapshots (if any) are done
791
if (extHostInspector) {
792
extHostInspector.close();
793
}
794
795
// Store partial metrics here so we can combine with trace data after close.
796
797
/** @param {any} r @param {string} name */
798
function getMetric(r, name) {
799
const e = r.metrics?.find((/** @type {any} */ m) => m.name === name);
800
return e ? e.value : 0;
801
}
802
803
partialMetrics = {
804
heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024),
805
heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024),
806
heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024),
807
heapDeltaPostGC: await (async () => {
808
// Force a full GC then measure heap to get deterministic retained-memory delta.
809
// --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc()
810
// when includeCommandLineAPI is true.
811
try {
812
await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
813
await new Promise(r => setTimeout(r, 200));
814
const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
815
return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024);
816
} catch {
817
return -1; // gc() not available in this build
818
}
819
})(),
820
layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'),
821
recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'),
822
forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'),
823
frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'),
824
compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'),
825
paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'),
826
responseHasContent: responseInfo.hasContent,
827
profilePath,
828
tracePath,
829
snapshotPath,
830
extHostHeapUsedBefore,
831
extHostHeapUsedAfter,
832
extHostHeapDelta,
833
extHostHeapDeltaPostGC,
834
extHostProfilePath,
835
extHostSnapshotPath,
836
};
837
} finally {
838
if (extHostInspector) {
839
try { extHostInspector.close(); } catch { }
840
}
841
activeVSCode = null;
842
await vscode.close();
843
}
844
845
// Read the trace file written by VS Code on exit via --trace-startup-file
846
/** @type {Array<any>} */
847
let traceEvents = [];
848
try {
849
const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8'));
850
traceEvents = traceData.traceEvents || [];
851
} catch {
852
// Trace file may not exist if VS Code crashed before shutdown
853
}
854
855
// Extract code/chat/* perf marks from blink.user_timing trace events.
856
// These appear as instant ('R' or 'I') events with timestamps in microseconds.
857
const chatMarks = traceEvents
858
.filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/'))
859
.map(e => ({ name: e.name, startTime: e.ts / 1000 }));
860
861
if (verbose && chatMarks.length > 0) {
862
console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`);
863
}
864
865
// Parse timing — prefer internal code/chat/* marks (precise, in-process)
866
// with client-side Date.now() as fallback for older builds without marks.
867
const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated');
868
const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken');
869
const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime);
870
const timeToComplete = responseCompleteTime - submitTime;
871
const timeToRenderComplete = renderCompleteTime - submitTime;
872
const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions');
873
const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke');
874
875
// Parse GC events from trace.
876
// Use the trace-event category and phase fields which are stable
877
// across V8 versions, rather than matching event name substrings.
878
let majorGCs = 0, minorGCs = 0, gcDurationMs = 0;
879
for (const event of traceEvents) {
880
const isGC = event.cat === 'v8.gc'
881
|| event.cat === 'devtools.timeline,v8'
882
|| (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => {
883
const t = c.trim();
884
return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats';
885
}));
886
if (!isGC) { continue; }
887
// Only count complete ('X') or duration-begin ('B') events to
888
// avoid double-counting begin/end pairs.
889
if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; }
890
const name = event.name || '';
891
if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; }
892
else if (/Minor|Scaveng/i.test(name)) { minorGCs++; }
893
else { minorGCs++; } // default unknown GC events to minor
894
if (event.dur) { gcDurationMs += event.dur / 1000; }
895
}
896
// Parse Layout duration from devtools.timeline trace events.
897
let layoutDurationMs = 0;
898
for (const event of traceEvents) {
899
if (event.name === 'Layout' && event.ph === 'X' && event.dur) {
900
layoutDurationMs += event.dur / 1000;
901
}
902
}
903
904
let longTaskCount = 0;
905
for (const event of traceEvents) {
906
if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; }
907
}
908
909
// Parse Long Animation Frame (LoAF) events from devtools.timeline trace.
910
// AnimationFrame events use async flow pairs (ph:'s' start, ph:'f' finish)
911
// with matching ids. Compute duration from each s→f pair.
912
let longAnimationFrameCount = 0;
913
let longAnimationFrameTotalMs = 0;
914
{
915
/** @type {Map<number, number>} */
916
const frameStarts = new Map();
917
for (const event of traceEvents) {
918
if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') {
919
if (event.ph === 's') {
920
frameStarts.set(event.id, event.ts);
921
} else if (event.ph === 'f' && frameStarts.has(event.id)) {
922
const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000;
923
frameStarts.delete(event.id);
924
if (durationMs > 50) {
925
longAnimationFrameCount++;
926
longAnimationFrameTotalMs += durationMs;
927
}
928
}
929
}
930
}
931
}
932
933
return {
934
...partialMetrics,
935
timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime,
936
hasInternalMarks: chatMarks.length > 0,
937
internalFirstToken,
938
majorGCs, minorGCs,
939
gcDurationMs: Math.round(gcDurationMs * 100) / 100,
940
layoutDurationMs: Math.round(layoutDurationMs * 100) / 100,
941
longTaskCount,
942
longAnimationFrameCount,
943
longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100,
944
};
945
}
946
947
// -- CI summary generation ---------------------------------------------------
948
949
const GITHUB_REPO = 'https://github.com/microsoft/vscode';
950
951
/**
 * Render a build identifier as Markdown.
 * - Commit SHAs become links to the commit page (shortened to 7 chars).
 * - Semver version strings become links to the release tag page.
 * - Anything else (e.g. "baseline", "dev (local)") is shown as inline code.
 * @param {string} label
 * @returns {string}
 */
function formatBuildLink(label) {
	const looksLikeSha = /^[0-9a-f]{7,40}$/.test(label);
	if (looksLikeSha) {
		// Short SHA for display, full SHA in the URL.
		return `[\`${label.substring(0, 7)}\`](${GITHUB_REPO}/commit/${label})`;
	}
	const looksLikeVersion = /^\d+\.\d+\.\d+/.test(label);
	return looksLikeVersion
		? `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`
		: `\`${label}\``;
}
969
970
/**
 * Build a GitHub compare link between two build identifiers, if both are
 * commit-like or version-like references. Returns empty string otherwise.
 * @param {string} base
 * @param {string} test
 * @returns {string}
 */
function formatCompareLink(base, test) {
	const looksLikeRef = (/** @type {string} */ id) =>
		/^[0-9a-f]{7,40}$/.test(id) || /^\d+\.\d+\.\d+/.test(id);
	if (looksLikeRef(base) && looksLikeRef(test)) {
		return `[compare](${GITHUB_REPO}/compare/${base}...${test})`;
	}
	return '';
}
984
985
/**
 * Generate a detailed Markdown summary table for CI.
 * Printed to stdout and written to ci-summary.md.
 *
 * @param {Record<string, any>} jsonReport
 * @param {Record<string, any> | null} baseline
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
 * @returns {string} The full Markdown document (lines joined with '\n').
 */
function generateCISummary(jsonReport, baseline, opts) {
	// Display labels for the two builds. 'dev'/'production' build modes refer
	// to local source builds; otherwise the explicit --build value (or the
	// raw mode string) is used.
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);
	// [metric name, group key within the per-scenario stats object, display unit]
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can produce a REGRESSION/improved verdict; all others
	// are reported as informational ('info') without pass/fail judgment.
	const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']);

	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);

	// -- Collect verdicts per scenario/metric --------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;

	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];

		// Verdicts are only computed when baseline data exists for the scenario;
		// otherwise the scenario gets an empty verdict list (rendered as "no
		// baseline" in the detail section below).
		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }

				// Relative change of medians; guarded against a zero baseline.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);

				// Raw per-run values feed the t-test. Negative values are
				// "metric unavailable" sentinels (e.g. -1 when gc() could not
				// run) and are excluded.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';

				// Verdict logic: a regression metric must both cross its
				// configured threshold AND be statistically significant (or
				// have no computable t-test) to count as REGRESSION; a
				// threshold-crossing but insignificant change is 'noise'.
				// Improvements additionally require significance.
				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let verdict = '';
				if (isRegressionMetric) {
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}

				// "median ± stddev" display strings (\xb1 is the ± sign).
				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}

	// -- Header with verdict up front ----------------------------------------
	const hasRegressions = totalRegressions > 0;
	const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
	const verdictText = hasRegressions
		? `${totalRegressions} regression(s) detected`
		: totalImprovements > 0
			? `No regressions \u2014 ${totalImprovements} improvement(s)`
			: 'No significant changes';

	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// Per-metric threshold overrides: only show the ones that actually differ
	// from the global fractional threshold.
	const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
		const parsed = parseMetricThreshold(v);
		return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
	});
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			const parsed = parseMetricThreshold(v);
			// Absolute thresholds on time-like metrics get an 'ms' suffix;
			// fractional thresholds render as a percentage.
			return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
	if (jsonReport.buildMode) {
		lines.push(`| **Build mode** | ${jsonReport.buildMode} |`);
	}
	lines.push('');
	if (jsonReport.mismatchedBuildMode) {
		lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.');
		lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,');
		lines.push('> use the same build type for both (e.g. `--production-build` with a local');
		lines.push('> baseline path, or two version strings).');
		lines.push('');
	}

	// -- At-a-glance overview table: one row per scenario --------------------
	lines.push(`## Overview`);
	lines.push('');
	lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);

		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');

		// Render a percent-change cell; em dash (\u2014) when no data exists.
		const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => {
			if (!v) { return '\u2014'; }
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
			return pct;
		};

		// Scenario-level verdict: any regression wins over any improvement.
		const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => {
			const hasRegression = vs.some(v => v.verdict === 'REGRESSION');
			const hasImproved = vs.some(v => v.verdict === 'improved');
			if (hasRegression) { return '\u274C Regressed'; }
			if (hasImproved) { return '\u2B06\uFE0F Improved'; }
			return '\u2705 OK';
		};

		const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
		const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts));

		lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');

	// -- Regressions & improvements detail section ---------------------------
	const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
	if (hasNotable) {
		lines.push('## Regressions & Improvements');
		lines.push('');
		lines.push('Only metrics that regressed or improved significantly are shown below.');
		lines.push('');

		for (const scenario of scenarios) {
			const verdicts = scenarioVerdicts.get(scenario) || [];
			const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
			if (notable.length === 0) { continue; }

			const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`### ${icon} ${scenario}`);
			lines.push('');
			lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
			lines.push('|--------|----------|------|--------|---------|---------|');
			for (const v of notable) {
				const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
				const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
				lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`);
			}
			lines.push('');
		}
	}

	// -- Full metric tables in collapsible section ---------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];

		lines.push(`### ${scenario}`);
		lines.push('');

		// Without a baseline, show absolute stats only (no change/verdict).
		if (!base) {
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}

		lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
		lines.push(`|--------|----------|------|--------|---------|---------|`);

		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');

	// -- Raw run data in collapsible section ---------------------------------
	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		lines.push(`### ${scenario}`);
		lines.push('');
		lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
		lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
		const runs = current.rawRuns || [];
		for (let i = 0; i < runs.length; i++) {
			const r = runs[i];
			const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
			lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
		}
		lines.push('');
	}
	// Baseline raw runs mirror the test-run tables above.
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			lines.push(`### ${scenario} (baseline)`);
			lines.push('');
			lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
			lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
			const runs = base.rawRuns || [];
			for (let i = 0; i < runs.length; i++) {
				const r = runs[i];
				const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
				lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
			}
			lines.push('');
		}
	}
	lines.push('</details>');
	lines.push('');

	return lines.join('\n');
}
1273
1274
// -- Cleanup on SIGINT/SIGTERM -----------------------------------------------

/** @type {{ close: () => Promise<void> } | null} */
let activeVSCode = null;
/** @type {{ close: () => Promise<void> } | null} */
let activeMockServer = null;

/**
 * Register SIGINT/SIGTERM handlers that best-effort close the in-flight
 * VS Code instance and mock LLM server, then exit with status 130.
 */
function installSignalHandlers() {
	const cleanup = async () => {
		console.log('\n[chat-simulation] Caught interrupt, cleaning up...');
		// Close failures are ignored: we are shutting down anyway.
		try { await activeVSCode?.close(); } catch { }
		try { await activeMockServer?.close(); } catch { }
		process.exit(130);
	};
	for (const signal of ['SIGINT', 'SIGTERM']) {
		process.on(signal, cleanup);
	}
}
1291
1292
// -- Diagnostic cleanup ------------------------------------------------------

/**
 * Remove large diagnostic files (heap snapshots, CPU profiles, traces) from
 * a run's metrics to free disk space. Keeps the JSON results data intact.
 * Missing paths and delete failures are ignored (best-effort cleanup).
 * @param {RunMetrics} metrics
 */
function cleanupRunDiagnostics(metrics) {
	const filesToDelete = [
		metrics.profilePath,
		metrics.tracePath,
		metrics.snapshotPath,
		metrics.extHostProfilePath,
		metrics.extHostSnapshotPath,
	];
	for (const filePath of filesToDelete) {
		if (!filePath) { continue; }
		try {
			// force:true makes a missing file a no-op, so no existsSync
			// pre-check is needed (the check-then-delete pattern was also
			// racy if the file vanished between the two calls).
			fs.rmSync(filePath, { force: true });
		} catch {
			// Ignore cleanup errors (e.g. permissions, file in use)
		}
	}
}
1317
1318
/**
 * Clean up diagnostics for all scenarios that did NOT regress.
 * Keeps diagnostics for regressed scenarios so they can be investigated.
 * @param {Record<string, RunMetrics[]>} allResults - test results by scenario
 * @param {Set<string>} regressedScenarios - scenarios that regressed
 */
function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) {
	for (const [scenario, runs] of Object.entries(allResults)) {
		// Regressed scenarios keep their diagnostic files for investigation.
		if (!regressedScenarios.has(scenario)) {
			runs.forEach((metrics) => cleanupRunDiagnostics(metrics));
		}
	}
}
1334
1335
// -- Main --------------------------------------------------------------------
1336
1337
async function main() {
1338
registerPerfScenarios();
1339
const opts = parseArgs();
1340
1341
installSignalHandlers();
1342
1343
const { startServer } = require('./common/mock-llm-server');
1344
const mockServer = await startServer(0);
1345
activeMockServer = mockServer;
1346
console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`);
1347
1348
// -- Resume mode --------------------------------------------------------
1349
if (opts.resume) {
1350
if (!fs.existsSync(opts.resume)) {
1351
console.error(`[chat-simulation] Resume file not found: ${opts.resume}`);
1352
process.exit(1);
1353
}
1354
const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8'));
1355
const prevDir = path.dirname(opts.resume);
1356
1357
// Find the associated baseline JSON in the same directory
1358
const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json'));
1359
const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null;
1360
const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null;
1361
1362
// Determine which scenarios to resume (default: all from previous run)
1363
const resumeScenarios = opts.scenarios.length > 0
1364
? opts.scenarios.filter(s => prevResults.scenarios?.[s])
1365
: Object.keys(prevResults.scenarios || {});
1366
1367
if (resumeScenarios.length === 0) {
1368
console.error('[chat-simulation] No matching scenarios found in previous results');
1369
process.exit(1);
1370
}
1371
1372
const testElectron = await resolveBuild(opts.build);
1373
const baselineVersion = prevBaseline?.baselineBuildVersion;
1374
const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null;
1375
1376
const runsToAdd = opts.runs;
1377
console.log(`[chat-simulation] Resuming from: ${opts.resume}`);
1378
console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`);
1379
console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`);
1380
if (prevBaseline) {
1381
console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`);
1382
}
1383
console.log('');
1384
1385
for (const scenario of resumeScenarios) {
1386
console.log(`[chat-simulation] === Resuming: ${scenario} ===`);
1387
const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || [];
1388
const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || [];
1389
1390
// Run additional test iterations
1391
console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`);
1392
for (let i = 0; i < runsToAdd; i++) {
1393
const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`;
1394
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
1395
try {
1396
const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
1397
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1398
if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); }
1399
prevTestRuns.push(m);
1400
if (opts.verbose) {
1401
const src = m.hasInternalMarks ? 'internal' : 'client-side';
1402
console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`);
1403
}
1404
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1405
}
1406
1407
// Run additional baseline iterations
1408
if (baselineElectron && prevBaseline?.scenarios?.[scenario]) {
1409
console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`);
1410
for (let i = 0; i < runsToAdd; i++) {
1411
const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`;
1412
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
1413
try {
1414
const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
1415
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1416
if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); }
1417
prevBaseRuns.push(m);
1418
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1419
}
1420
}
1421
1422
// Recompute stats with merged data
1423
const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns });
1424
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); }
1425
prevResults.scenarios[scenario] = sd;
1426
1427
if (prevBaseline?.scenarios?.[scenario]) {
1428
const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns });
1429
for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); }
1430
prevBaseline.scenarios[scenario] = bsd;
1431
}
1432
console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`);
1433
console.log('');
1434
}
1435
1436
// Write updated files back
1437
prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs));
1438
prevResults.lastResumed = new Date().toISOString();
1439
fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2));
1440
console.log(`[chat-simulation] Updated results: ${opts.resume}`);
1441
1442
if (prevBaseline && baselineFile) {
1443
prevBaseline.lastResumed = new Date().toISOString();
1444
fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2));
1445
// Also update cached baseline
1446
const cachedPath = path.join(DATA_DIR, path.basename(baselineFile));
1447
fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2));
1448
console.log(`[chat-simulation] Updated baseline: ${baselineFile}`);
1449
}
1450
1451
// -- Re-run comparison with merged data --------------------------------
1452
opts.baseline = baselineFile || undefined;
1453
const jsonReport = prevResults;
1454
jsonReport._resultsPath = opts.resume;
1455
1456
// Fall through to comparison logic below
1457
await printComparison(jsonReport, opts);
1458
await mockServer.close();
1459
return;
1460
}
1461
1462
// -- Normal (non-resume) flow -------------------------------------------
1463
// --production-build: build a local bundled (non-dev) package from the
1464
// current source tree using `gulp vscode`. This produces the same
1465
// packaging as a release build (bundled JS, no VSCODE_DEV) while still
1466
// testing your local changes.
1467
if (opts.productionBuild && !opts.build) {
1468
const prodBuildPath = buildProductionBuild();
1469
opts.build = prodBuildPath;
1470
console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`);
1471
}
1472
1473
const electronPath = await resolveBuild(opts.build);
1474
1475
if (!fs.existsSync(electronPath)) {
1476
console.error(`Electron not found at: ${electronPath}`);
1477
console.error('Run "node build/lib/preLaunch.ts" first, or pass --build <path>');
1478
process.exit(1);
1479
}
1480
1481
// Detect build modes for both test and baseline builds
1482
const testBuildMode = detectBuildMode(electronPath);
1483
1484
// Resolve the baseline build path early so we can detect its mode.
1485
// For version strings this downloads; for local paths it resolves directly.
1486
const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild);
1487
const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString;
1488
/** @type {string | undefined} */
1489
let baselineElectronPath;
1490
if (isBaselineLocalPath) {
1491
baselineElectronPath = await resolveBuild(opts.baselineBuild);
1492
if (!fs.existsSync(baselineElectronPath)) {
1493
console.error(`Baseline build not found at: ${baselineElectronPath}`);
1494
process.exit(1);
1495
}
1496
}
1497
const baselineBuildMode = opts.baselineBuild
1498
? (isBaselineVersionString ? 'release' : detectBuildMode(baselineElectronPath || ''))
1499
: undefined;
1500
1501
const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode;
1502
1503
// Create a timestamped run directory for all output
1504
const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
1505
const runDir = path.join(DATA_DIR, runTimestamp);
1506
fs.mkdirSync(runDir, { recursive: true });
1507
console.log(`[chat-simulation] Output: ${runDir}`);
1508
1509
// Compute effective settings per role
1510
const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides };
1511
const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides };
1512
1513
// -- Baseline build --------------------------------------------------
1514
if (opts.baselineBuild) {
1515
// Use a sanitized label for file names — replace path separators for local paths
1516
const baselineLabel = isBaselineLocalPath
1517
? path.basename(path.resolve(opts.baselineBuild))
1518
: opts.baselineBuild;
1519
const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`);
1520
1521
// Local paths: always run fresh (no caching — the build may have changed)
1522
// Version strings: use caching as before
1523
const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`);
1524
const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath)
1525
? JSON.parse(fs.readFileSync(cachedPath, 'utf-8'))
1526
: null;
1527
1528
if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) {
1529
// Check if the cache covers all requested scenarios
1530
const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {}));
1531
const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s));
1532
1533
// Also check if cached scenarios have fewer runs than requested
1534
const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => {
1535
const cached = cachedBaseline.scenarios?.[s];
1536
return cached && (cached.rawRuns?.length || 0) < opts.runs;
1537
});
1538
1539
if (missingScenarios.length === 0 && shortScenarios.length === 0) {
1540
console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`);
1541
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
1542
opts.baseline = baselineJsonPath;
1543
} else {
1544
const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])];
1545
if (missingScenarios.length > 0) {
1546
console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`);
1547
}
1548
if (shortScenarios.length > 0) {
1549
console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`);
1550
}
1551
console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`);
1552
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
1553
for (const scenario of scenariosToRun) {
1554
const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || [];
1555
const runsNeeded = opts.runs - existingRuns.length;
1556
/** @type {RunMetrics[]} */
1557
const newResults = [];
1558
for (let i = 0; i < runsNeeded; i++) {
1559
try {
1560
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
1561
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1562
if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); }
1563
newResults.push(m);
1564
}
1565
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
1566
}
1567
const allRuns = [...existingRuns, ...newResults];
1568
if (allRuns.length > 0) {
1569
const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns });
1570
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); }
1571
cachedBaseline.scenarios[scenario] = sd;
1572
}
1573
}
1574
cachedBaseline.runsPerScenario = opts.runs;
1575
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
1576
if (cachedPath) {
1577
fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2));
1578
}
1579
opts.baseline = baselineJsonPath;
1580
}
1581
} else {
1582
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
1583
console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`);
1584
/** @type {Record<string, RunMetrics[]>} */
1585
const baselineResults = {};
1586
for (const scenario of opts.scenarios) {
1587
/** @type {RunMetrics[]} */
1588
const results = [];
1589
for (let i = 0; i < opts.runs; i++) {
1590
try {
1591
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
1592
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1593
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
1594
results.push(m);
1595
}
1596
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
1597
}
1598
if (results.length > 0) { baselineResults[scenario] = results; }
1599
}
1600
const baselineReport = {
1601
timestamp: new Date().toISOString(),
1602
baselineBuildVersion: opts.baselineBuild,
1603
platform: process.platform,
1604
runsPerScenario: opts.runs,
1605
scenarios: /** @type {Record<string, any>} */ ({}),
1606
};
1607
for (const [scenario, results] of Object.entries(baselineResults)) {
1608
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
1609
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
1610
baselineReport.scenarios[scenario] = sd;
1611
}
1612
fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2));
1613
// Cache at the top level for reuse across runs (version strings only)
1614
if (cachedPath) {
1615
fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2));
1616
}
1617
opts.baseline = baselineJsonPath;
1618
}
1619
console.log('');
1620
}
1621
1622
// -- Run benchmarks --------------------------------------------------
1623
console.log(`[chat-simulation] Electron: ${electronPath}`);
1624
console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`);
1625
if (baselineBuildMode) {
1626
console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`);
1627
}
1628
console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`);
1629
console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`);
1630
if (Object.keys(opts.settingsOverrides).length > 0) {
1631
console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`);
1632
}
1633
if (Object.keys(opts.testSettingsOverrides).length > 0) {
1634
console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`);
1635
}
1636
if (Object.keys(opts.baselineSettingsOverrides).length > 0) {
1637
console.log(`[chat-simulation] Settings overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`);
1638
}
1639
1640
if (isMismatchedBuildMode) {
1641
console.log('');
1642
console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`);
1643
console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples');
1644
console.log('[chat-simulation] comparisons, use the same build type for both.');
1645
if (testBuildMode === 'dev') {
1646
console.log('[chat-simulation] To use a local production build instead:');
1647
console.log('[chat-simulation] npm run perf:chat -- --production-build');
1648
}
1649
if (!opts.ci && !opts.force) {
1650
const readline = require('readline');
1651
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
1652
const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve));
1653
rl.close();
1654
if (String(answer).toLowerCase() !== 'y') {
1655
console.log('[chat-simulation] Aborted.');
1656
await mockServer.close();
1657
process.exit(0);
1658
}
1659
}
1660
}
1661
console.log('');
1662
1663
/** @type {Record<string, RunMetrics[]>} */
1664
const allResults = {};
1665
let anyFailed = false;
1666
1667
for (const scenario of opts.scenarios) {
1668
console.log(`[chat-simulation] === Scenario: ${scenario} ===`);
1669
/** @type {RunMetrics[]} */
1670
const results = [];
1671
for (let i = 0; i < opts.runs; i++) {
1672
console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`);
1673
try {
1674
const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots });
1675
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1676
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
1677
results.push(metrics);
1678
if (opts.verbose) {
1679
const src = metrics.hasInternalMarks ? 'internal' : 'client-side';
1680
console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`);
1681
}
1682
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1683
}
1684
if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; }
1685
else { allResults[scenario] = results; }
1686
console.log('');
1687
}
1688
1689
// -- Summary ---------------------------------------------------------
1690
console.log('[chat-simulation] ======================= Summary =======================');
1691
for (const [scenario, results] of Object.entries(allResults)) {
1692
console.log('');
1693
console.log(` -- ${scenario} (${results.length} runs) --`);
1694
console.log('');
1695
console.log(' Timing:');
1696
console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms'));
1697
console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms'));
1698
console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms'));
1699
console.log('');
1700
console.log(' Rendering:');
1701
console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', ''));
1702
console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms'));
1703
console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', ''));
1704
console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', ''));
1705
console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', ''));
1706
console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', ''));
1707
console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms'));
1708
console.log(summarize(results.map(r => r.frameCount), ' Frames ', ''));
1709
console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', ''));
1710
console.log(summarize(results.map(r => r.paintCount), ' Paints ', ''));
1711
console.log('');
1712
console.log(' Memory:');
1713
console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB'));
1714
console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
1715
console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms'));
1716
if (results.some(r => r.extHostHeapDelta >= 0)) {
1717
console.log('');
1718
console.log(' Extension Host:');
1719
console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB'));
1720
console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB'));
1721
console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB'));
1722
console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
1723
}
1724
}
1725
1726
// -- JSON output -----------------------------------------------------
1727
const jsonPath = path.join(runDir, 'results.json');
1728
const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record<string, any>, _resultsPath?: string }} */ ({
1729
timestamp: new Date().toISOString(),
1730
platform: process.platform,
1731
runsPerScenario: opts.runs,
1732
buildMode: testBuildMode,
1733
mismatchedBuildMode: !!isMismatchedBuildMode,
1734
scenarios: /** @type {Record<string, any>} */ ({}),
1735
});
1736
for (const [scenario, results] of Object.entries(allResults)) {
1737
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
1738
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
1739
jsonReport.scenarios[scenario] = sd;
1740
}
1741
fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2));
1742
jsonReport._resultsPath = jsonPath;
1743
console.log('');
1744
console.log(`[chat-simulation] Results written to ${jsonPath}`);
1745
1746
// -- Save baseline ---------------------------------------------------
1747
if (opts.saveBaseline) {
1748
if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline <path>'); process.exit(1); }
1749
fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2));
1750
console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`);
1751
}
1752
1753
// -- Baseline comparison ---------------------------------------------
1754
const regressedScenarios = await printComparison(jsonReport, opts);
1755
1756
// Clean up diagnostics for scenarios that did not regress
1757
if (opts.cleanupDiagnostics) {
1758
cleanupNonRegressedDiagnostics(allResults, regressedScenarios);
1759
}
1760
1761
if (anyFailed) { process.exit(1); }
1762
await mockServer.close();
1763
}
1764
1765
/**
 * Print baseline comparison and exit with code 1 if regressions found.
 * Returns the set of scenario IDs that regressed.
 *
 * Regression metrics fail the run only when the change exceeds the configured
 * threshold AND Welch's t-test on the raw per-run values reports significance.
 * Info metrics are displayed but never trigger failure.
 *
 * @param {Record<string, any>} jsonReport
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts
 * @returns {Promise<Set<string>>}
 */
async function printComparison(jsonReport, opts) {
	let regressionFound = false;
	let inconclusiveFound = false;
	/** @type {Set<string>} */
	const regressedScenarios = new Set();

	// Extract a metric's raw per-run values from a scenario summary, dropping
	// negative sentinel values (used when a metric was unavailable for a run).
	const rawValues = (/** @type {any} */ sd, /** @type {string} */ metric) =>
		(sd?.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);

	if (opts.baseline && fs.existsSync(opts.baseline)) {
		const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'));
		console.log('');
		console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`);
		console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`);
		if (jsonReport.mismatchedBuildMode) {
			console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`);
			console.log('[chat-simulation] Results may not be directly comparable.');
		}
		console.log('');

		// Metrics that trigger regression failure when they exceed the threshold
		const regressionMetrics = [
			// [metric, group, unit]
			['timeToFirstToken', 'timing', 'ms'],
			['timeToComplete', 'timing', 'ms'],
			['layoutCount', 'rendering', ''],
			['recalcStyleCount', 'rendering', ''],
			['forcedReflowCount', 'rendering', ''],
			['longTaskCount', 'rendering', ''],
		];
		// Informational metrics — shown in comparison but don't trigger failure
		const infoMetrics = [
			['heapDelta', 'memory', 'MB'],
			['gcDurationMs', 'memory', 'ms'],
			['extHostHeapDelta', 'extHost', 'MB'],
			['extHostHeapDeltaPostGC', 'extHost', 'MB'],
		];

		for (const scenario of Object.keys(jsonReport.scenarios)) {
			const current = jsonReport.scenarios[scenario];
			const base = baseline.scenarios?.[scenario];
			if (!base) { console.log(` ${scenario}: (no baseline)`); continue; }

			/** @type {string[]} */
			const diffs = [];
			let scenarioRegression = false;

			for (const [metric, group, unit] of regressionMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// FIX: the previous guard (`!bas.median`) used truthiness, so any
				// metric with a baseline median of 0 (common for counters such as
				// forcedReflowCount / longTaskCount) was silently skipped and a
				// 0 → N regression was never reported. Require both medians to be
				// numbers and guard the division explicitly instead, mirroring the
				// info-metrics branch below.
				if (!cur || !bas || typeof cur.median !== 'number' || typeof bas.median !== 'number') { continue; }
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;

				// Statistical significance via Welch's t-test on raw run values
				const curRaw = rawValues(current, metric);
				const basRaw = rawValues(base, metric);
				const ttest = welchTTest(basRaw, curRaw);

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let flag = '';
				if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
					if (!ttest) {
						flag = ' ← possible regression (n too small for significance test)';
						inconclusiveFound = true;
					} else if (ttest.significant) {
						flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`;
						scenarioRegression = true;
						regressionFound = true;
					} else {
						flag = ` (likely noise — p=${ttest.pValue}, not significant)`;
						inconclusiveFound = true;
					}
				} else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') {
					flag = ` (significant increase, p=${ttest.pValue})`;
				}
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`);
			}
			for (const [metric, group, unit] of infoMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`);
			}
			console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`);
			if (scenarioRegression) { regressedScenarios.add(scenario); }
			diffs.forEach(d => console.log(d));
		}

		console.log('');
		console.log(regressionFound
			? `[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance`
			: `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`);

		if (inconclusiveFound && !regressionFound) {
			// Find the results.json path to suggest in the hint
			const resultsPath = Object.keys(jsonReport.scenarios).length > 0
				? (jsonReport._resultsPath || opts.resume || 'path/to/results.json')
				: 'path/to/results.json';
			// Estimate required runs from the observed effect size and variance
			// using power analysis for Welch's t-test (alpha=0.05, 80% power).
			// n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d
			let maxNeeded = 0;
			for (const scenario of Object.keys(jsonReport.scenarios)) {
				const current = jsonReport.scenarios[scenario];
				const base = baseline.scenarios?.[scenario];
				if (!base) { continue; }
				for (const [metric] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) {
					const curRaw = rawValues(current, metric);
					const basRaw = rawValues(base, metric);
					if (curRaw.length < 2 || basRaw.length < 2) { continue; }
					const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length;
					const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length;
					const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1);
					const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1);
					const pooledSD = Math.sqrt((varA + varB) / 2);
					if (pooledSD === 0) { continue; }
					const d = Math.abs(meanB - meanA) / pooledSD;
					if (d === 0) { continue; }
					// z_0.025 = 1.96, z_0.2 = 0.842
					const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2);
					const currentN = Math.min(curRaw.length, basRaw.length);
					maxNeeded = Math.max(maxNeeded, nPerGroup - currentN);
				}
			}
			const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20));
			console.log('');
			console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.');
			console.log('[chat-simulation] To increase confidence, add more runs with --resume:');
			console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`);
		}
	}

	// -- CI summary ------------------------------------------------------
	if (opts.ci) {
		const ciBaseline = opts.baseline && fs.existsSync(opts.baseline)
			? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'))
			: null;
		const summary = generateCISummary(jsonReport, ciBaseline, {
			threshold: opts.threshold,
			metricThresholds: opts.metricThresholds,
			runs: jsonReport.runsPerScenario || opts.runs,
			baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild,
			build: opts.build,
		});

		// Write to file for GitHub Actions $GITHUB_STEP_SUMMARY
		const summaryPath = path.join(DATA_DIR, 'ci-summary.md');
		fs.writeFileSync(summaryPath, summary);
		console.log(`[chat-simulation] CI summary written to ${summaryPath}`);

		// Also print the full summary table to stdout
		console.log('');
		console.log('==================================================================');
		console.log(' CHAT PERF COMPARISON RESULTS ');
		console.log('==================================================================');
		console.log('');
		console.log(summary);
	}

	if (regressionFound) { process.exit(1); }
	return regressedScenarios;
}
1934
1935
// Entry point: any unhandled failure in the benchmark driver is fatal.
main().catch((err) => {
	console.error(err);
	process.exit(1);
});
1936
1937