Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/scripts/chat-simulation/test-chat-perf-regression.js
13379 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
// @ts-check
7
8
/**
9
* Chat performance benchmark.
10
*
11
* Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local
12
* mock LLM server. Measures the full stack: prompt building, context
13
* gathering, tool resolution, rendering, GC, and layout overhead.
14
*
15
* Usage:
16
* npm run perf:chat # all scenarios vs 1.115.0
17
* npm run perf:chat -- --runs 10 # 10 runs per scenario
18
* npm run perf:chat -- --scenario text-only # single scenario
19
* npm run perf:chat -- --no-baseline # skip baseline comparison
20
* npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0
21
* npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3
22
*/
23
24
const path = require('path');
25
const fs = require('fs');
26
const {
27
ROOT, DATA_DIR, METRIC_DEFS, loadConfig,
28
resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir,
29
robustStats, welchTTest, summarize, markDuration, launchVSCode,
30
getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot,
31
} = require('./common/utils');
32
const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server');
33
const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios');
34
35
// -- Config (edit config.jsonc to change defaults) ---------------------------
36
37
// Defaults for runs-per-scenario, baseline build, and thresholds come from
// the "perfRegression" section of config.jsonc.
const CONFIG = loadConfig('perfRegression');
38
39
// -- CLI args ----------------------------------------------------------------
40
41
/**
 * Parse process.argv into benchmark options.
 *
 * Defaults come from config.jsonc (the "perfRegression" section) via CONFIG.
 * Exits the process for --help, for malformed key=value settings, for
 * non-numeric --runs/--threshold values, and for unknown scenario ids.
 *
 * @returns {{
 *  runs: number, verbose: boolean, ci: boolean, noCache: boolean,
 *  force: boolean, heapSnapshots: boolean, scenarios: string[],
 *  build: string | undefined, baseline: string | undefined,
 *  baselineBuild: string | undefined, saveBaseline: boolean,
 *  threshold: number, metricThresholds: Record<string, number | string>,
 *  resume: string | undefined, productionBuild: boolean,
 *  settingsOverrides: Record<string, any>,
 *  testSettingsOverrides: Record<string, any>,
 *  baselineSettingsOverrides: Record<string, any>,
 *  cleanupDiagnostics: boolean,
 * }}
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		runs: CONFIG.runsPerScenario ?? 5,
		verbose: false,
		ci: false,
		noCache: false,
		force: false,
		heapSnapshots: false,
		/** @type {string[]} */
		scenarios: [],
		/** @type {string | undefined} */
		build: undefined,
		/** @type {string | undefined} */
		baseline: undefined,
		/** @type {string | undefined} */
		baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
		saveBaseline: false,
		threshold: CONFIG.regressionThreshold ?? 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: CONFIG.metricThresholds ?? {},
		/** @type {string | undefined} */
		resume: undefined,
		productionBuild: false,
		/** @type {Record<string, any>} */
		settingsOverrides: {},
		/** @type {Record<string, any>} */
		testSettingsOverrides: {},
		/** @type {Record<string, any>} */
		baselineSettingsOverrides: {},
		cleanupDiagnostics: false,
	};
	for (let i = 0; i < args.length; i++) {
		switch (args[i]) {
			case '--runs': {
				// Fail fast instead of silently storing NaN (e.g. `--runs` as
				// the last token, or `--runs abc`), which would make every
				// per-scenario run loop execute zero times.
				opts.runs = Number.parseInt(args[++i], 10);
				if (!Number.isInteger(opts.runs) || opts.runs <= 0) {
					console.error(`--runs requires a positive integer, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			}
			case '--verbose': opts.verbose = true; break;
			case '--scenario': case '-s': opts.scenarios.push(args[++i]); break;
			case '--build': case '-b': opts.build = args[++i]; break;
			case '--baseline': opts.baseline = args[++i]; break;
			case '--baseline-build': opts.baselineBuild = args[++i]; break;
			case '--no-baseline': opts.baselineBuild = undefined; break;
			case '--save-baseline': opts.saveBaseline = true; break;
			case '--threshold': {
				// Same fail-fast treatment as --runs: a NaN threshold would
				// make every regression comparison silently pass.
				opts.threshold = Number.parseFloat(args[++i]);
				if (Number.isNaN(opts.threshold)) {
					console.error(`--threshold requires a number, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			}
			case '--resume': opts.resume = args[++i]; break;
			case '--production-build': opts.productionBuild = true; break;
			case '--setting': case '--test-setting': case '--baseline-setting': {
				const kv = args[++i];
				const eq = kv.indexOf('=');
				if (eq === -1) { console.error(`${args[i - 1]} requires key=value, got: ${kv}`); process.exit(1); }
				const key = kv.slice(0, eq);
				const raw = kv.slice(eq + 1);
				// Parse booleans and numbers, keep rest as strings
				const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw;
				const flag = args[i - 1];
				if (flag === '--test-setting') { opts.testSettingsOverrides[key] = val; }
				else if (flag === '--baseline-setting') { opts.baselineSettingsOverrides[key] = val; }
				else { opts.settingsOverrides[key] = val; }
				break;
			}
			case '--no-cache': opts.noCache = true; break;
			case '--force': opts.force = true; break;
			case '--heap-snapshots': opts.heapSnapshots = true; break;
			case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break;
			case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break;
			case '--help': case '-h':
				console.log([
					'Chat performance benchmark',
					'',
					'Options:',
					'  --runs <n>            Number of runs per scenario (default: 5)',
					'  --scenario <id>       Scenario to run (repeatable; default: all)',
					'  --build <path|ver>    Path to VS Code build, or a version to download',
					'                        (e.g. "1.110.0", "insiders", commit hash, or local path)',
					'  --baseline <path>     Compare against a baseline JSON file',
					'  --baseline-build <v>  Version or path to benchmark as baseline',
					'                        (e.g. "1.115.0", "insiders", commit hash, or local path)',
					'  --no-baseline         Skip baseline comparison entirely',
					'  --save-baseline       Save results as the new baseline (requires --baseline <path>)',
					'  --resume <path>       Resume a previous run, adding more iterations to increase',
					'                        confidence. Merges new runs with existing rawRuns data',
					'  --threshold <frac>    Regression threshold fraction (default: 0.2 = 20%)',
					'  --production-build    Build a local bundled package (via gulp vscode) for',
					'                        apples-to-apples comparison against a release baseline',
					'  --setting <k=v>       Set a VS Code setting override for all builds (repeatable)',
					'  --test-setting <k=v>  Set a VS Code setting override for test build only',
					'  --baseline-setting <k=v> Set a VS Code setting override for baseline build only',
					'                        e.g. --setting chat.experimental.incrementalRendering.enabled=true',
					'  --no-cache            Ignore cached baseline data, always run fresh',
					'  --force               Skip build mode mismatch confirmation',
					'  --heap-snapshots      Take heap snapshots (slow; auto-enabled in --ci mode)',
					'  --ci                  CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)',
					'  --cleanup-diagnostics Remove heap snapshots, CPU profiles, and traces after each run to save disk space',
					'  --verbose             Print per-run details',
					'',
					'Scenarios: ' + getScenarioIds().join(', '),
				].join('\n'));
				process.exit(0);
		}
	}
	// No explicit --scenario means "run everything"; otherwise reject
	// typos before spending minutes on a benchmark run.
	if (opts.scenarios.length === 0) {
		opts.scenarios = getScenarioIds();
	} else {
		const knownIds = new Set(getScenarioIds());
		const unknown = opts.scenarios.filter(s => !knownIds.has(s));
		if (unknown.length > 0) {
			console.error(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`);
			process.exit(1);
		}
	}
	return opts;
}
152
153
// -- Build mode detection ----------------------------------------------------
154
155
/**
156
* Classify an electron path into a build mode.
157
* @param {string} electronPath
158
* @returns {'dev' | 'production' | 'release'}
159
*/
160
/**
 * Classify an electron path into a build mode.
 *
 * Downloaded release builds are unpacked under a `.vscode-test` directory;
 * locally packaged production builds live in a `VSCode-<platform>-<arch>`
 * directory. Anything else is treated as a development build. The
 * `.vscode-test` check deliberately runs first.
 *
 * @param {string} electronPath
 * @returns {'dev' | 'production' | 'release'}
 */
function detectBuildMode(electronPath) {
	if (electronPath.includes('.vscode-test')) {
		return 'release';
	}
	return electronPath.includes('VSCode-') ? 'production' : 'dev';
}
169
170
/**
171
* Return a human-readable label for a build mode.
172
* @param {'dev' | 'production' | 'release'} mode
173
* @returns {string}
174
*/
175
/**
 * Return a human-readable label for a build mode.
 * @param {'dev' | 'production' | 'release'} mode
 * @returns {string}
 */
function buildModeLabel(mode) {
	const labels = {
		dev: 'development (unbundled)',
		production: 'production (bundled, local)',
		release: 'release (bundled, downloaded)',
	};
	return labels[mode];
}
182
183
// -- Production build --------------------------------------------------------
184
185
/**
186
* Build a local production (bundled) VS Code package using `gulp vscode`.
187
* Returns the path to the Electron executable in the packaged output.
188
*
189
* The gulp task compiles TypeScript, bundles JS, and packages with Electron
190
* into `../VSCode-<platform>-<arch>/`. This is the same process used for
191
* release builds, minus minification and mangling.
192
*/
193
/**
 * Build a local production (bundled) VS Code package using `gulp vscode`.
 * Returns the path to the Electron executable in the packaged output.
 *
 * The gulp task compiles TypeScript, bundles JS, and packages with Electron
 * into `../VSCode-<platform>-<arch>/`. This is the same process used for
 * release builds, minus minification and mangling.
 *
 * @returns {string} absolute path to the packaged Electron executable;
 *   exits the process if the build did not produce one
 */
function buildProductionBuild() {
	const product = require(path.join(ROOT, 'product.json'));
	const platform = process.platform;
	const arch = process.arch;
	const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`);

	console.log('[chat-simulation] Building local production package (gulp vscode)...');
	console.log('[chat-simulation] This may take a few minutes on the first run.');

	const { execSync } = require('child_process');
	try {
		execSync('npm run gulp -- vscode', {
			cwd: ROOT,
			stdio: 'inherit',
			timeout: 10 * 60 * 1000, // 10 minute timeout
		});
	} catch (e) {
		// The copilot shim step may fail locally when the copilot SDK is not
		// fully packaged (it is normally supplied via CI). As long as the
		// Electron executable was produced we can still benchmark.
		console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...');
	}

	const electronPath = packagedElectronPath(destDir, product, platform);
	if (!fs.existsSync(electronPath)) {
		console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`);
		process.exit(1);
	}

	mergePackagedProductOverrides(destDir, product, platform);

	console.log(`[chat-simulation] Production build ready: ${electronPath}`);
	return electronPath;
}

/**
 * Locate the Electron executable inside a packaged VS Code output directory.
 * @param {string} destDir - the `VSCode-<platform>-<arch>` directory
 * @param {any} product - parsed product.json (names the executable)
 * @param {NodeJS.Platform} platform
 * @returns {string}
 */
function packagedElectronPath(destDir, product, platform) {
	if (platform === 'darwin') {
		return path.join(destDir, `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort);
	}
	if (platform === 'linux') {
		return path.join(destDir, product.applicationName);
	}
	// Windows and any other platform use an .exe next to the resources
	return path.join(destDir, `${product.nameShort}.exe`);
}

/**
 * Merge product.overrides.json into the packaged product.json.
 *
 * The overrides file contains extensionsGallery and other config that
 * the OSS product.json lacks. In dev builds these are loaded at
 * runtime when VSCODE_DEV is set, but the production build doesn't
 * set that flag so we bake them in. No-op when either file is absent.
 *
 * @param {string} destDir - the `VSCode-<platform>-<arch>` directory
 * @param {any} product - parsed product.json (names the .app bundle on macOS)
 * @param {NodeJS.Platform} platform
 */
function mergePackagedProductOverrides(destDir, product, platform) {
	const overridesPath = path.join(ROOT, 'product.overrides.json');
	if (!fs.existsSync(overridesPath)) {
		return;
	}
	const appDir = platform === 'darwin'
		? path.join(destDir, `${product.nameLong}.app`, 'Contents', 'Resources', 'app')
		: path.join(destDir, 'resources', 'app');
	const packagedProductPath = path.join(appDir, 'product.json');
	if (!fs.existsSync(packagedProductPath)) {
		return;
	}
	const packagedProduct = JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8'));
	const overrides = JSON.parse(fs.readFileSync(overridesPath, 'utf-8'));
	const merged = Object.assign(packagedProduct, overrides);
	fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t'));
	console.log('[chat-simulation] Merged product.overrides.json into packaged product.json');
}
258
259
/**
260
* @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
261
*/
262
263
/**
264
* Parse a metric threshold value from config.
265
* - A number is treated as a fraction (e.g. 0.2 = 20%).
266
* - A string like "100ms" or "5" is treated as an absolute delta.
267
* @param {number | string} raw
268
* @returns {MetricThreshold}
269
*/
270
/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta.
 * @param {number | string} raw
 * @returns {MetricThreshold}
 * @throws {Error} when a string threshold has no leading number
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		return { type: 'fraction', value: raw };
	}
	// parseFloat reads the leading number and ignores a trailing
	// unit suffix such as "ms" or "MB".
	const value = Number.parseFloat(raw);
	if (Number.isNaN(value)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value };
}
281
282
/**
283
* Get the regression threshold for a specific metric.
284
* Uses per-metric override from config if available, otherwise the global threshold.
285
* @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
286
* @param {string} metric
287
* @returns {MetricThreshold}
288
*/
289
/**
 * Get the regression threshold for a specific metric.
 * Uses per-metric override from config if available, otherwise the global threshold.
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
 * @param {string} metric
 * @returns {MetricThreshold}
 */
function getMetricThreshold(opts, metric) {
	const override = opts.metricThresholds?.[metric];
	return override === undefined
		? { type: 'fraction', value: opts.threshold }
		: parseMetricThreshold(override);
}
296
297
/**
298
* Check whether a change exceeds the threshold.
299
* @param {MetricThreshold} threshold
300
* @param {number} change - fractional change (e.g. 0.5 = 50% increase)
301
* @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
302
* @returns {boolean}
303
*/
304
/**
 * Check whether a change exceeds the threshold.
 * @param {MetricThreshold} threshold
 * @param {number} change - fractional change (e.g. 0.5 = 50% increase)
 * @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
 * @returns {boolean}
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	// Absolute thresholds compare the raw delta; fractional ones the relative change.
	const observed = threshold.type === 'absolute' ? absoluteDelta : change;
	return observed > threshold.value;
}
310
311
// -- Metrics -----------------------------------------------------------------
312
313
/**
314
* @typedef {{
315
* timeToUIUpdated: number,
316
* timeToFirstToken: number,
317
* timeToComplete: number,
318
* timeToRenderComplete: number,
319
* instructionCollectionTime: number,
320
* agentInvokeTime: number,
321
* heapUsedBefore: number,
322
* heapUsedAfter: number,
323
* heapDelta: number,
324
* heapDeltaPostGC: number,
325
* majorGCs: number,
326
* minorGCs: number,
327
* gcDurationMs: number,
328
* layoutCount: number,
329
* layoutDurationMs: number,
330
* recalcStyleCount: number,
331
* forcedReflowCount: number,
332
* longTaskCount: number,
333
* longAnimationFrameCount: number,
334
* longAnimationFrameTotalMs: number,
335
* frameCount: number,
336
* compositeLayers: number,
337
* paintCount: number,
338
* hasInternalMarks: boolean,
339
* responseHasContent: boolean,
340
* internalFirstToken: number,
341
* profilePath: string,
342
* tracePath: string,
343
* snapshotPath: string,
344
* extHostHeapUsedBefore: number,
345
* extHostHeapUsedAfter: number,
346
* extHostHeapDelta: number,
347
* extHostHeapDeltaPostGC: number,
348
* extHostProfilePath: string,
349
* extHostSnapshotPath: string,
350
* }} RunMetrics
351
*/
352
353
// -- Single run --------------------------------------------------------------
354
355
/**
356
* @param {string} electronPath
357
* @param {string} scenario
358
* @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise<void>, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise<void> }} mockServer
359
* @param {boolean} verbose
360
* @param {string} runIndex
361
* @param {string} runDir - timestamped run directory for diagnostics
362
* @param {'baseline' | 'test'} role - whether this is a baseline or test run
363
* @param {Record<string, any>} [settingsOverrides] - custom VS Code settings
364
* @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options
365
* @returns {Promise<RunMetrics>}
366
*/
367
async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) {
368
const takeHeapSnapshots = runOpts?.heapSnapshots ?? false;
369
const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides);
370
const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-');
371
// Extract a clean build label from the path.
372
// Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev"
373
// Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0"
374
// Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production"
375
let buildLabel = 'dev';
376
if (!isDevBuild) {
377
const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/);
378
if (vscodeTestMatch) {
379
buildLabel = vscodeTestMatch[1];
380
} else if (electronPath.includes('VSCode-')) {
381
buildLabel = 'production';
382
} else {
383
buildLabel = path.basename(electronPath);
384
}
385
}
386
387
// For dev builds from a different repo, derive the repo root from the
388
// electron path so that the build loads its own out/ source code.
389
const appRoot = isDevBuild ? (getRepoRoot(electronPath) || ROOT) : ROOT;
390
if (isDevBuild && appRoot !== ROOT) {
391
if (verbose) {
392
console.log(` [debug] Using appRoot from electron path: ${appRoot}`);
393
}
394
}
395
396
// Create a per-run diagnostics directory: <runDir>/<role>-<build>/<scenario>-<i>/
397
const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, ''));
398
fs.mkdirSync(runDiagDir, { recursive: true });
399
400
const tracePath = path.join(runDiagDir, 'trace.json');
401
const extHostInspectPort = getNextExtHostInspectPort();
402
const vscode = await launchVSCode(
403
electronPath,
404
buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }),
405
buildEnv(mockServer, { isDevBuild }),
406
{ verbose },
407
);
408
activeVSCode = vscode;
409
const window = vscode.page;
410
411
// Declared outside try so the finally block can clean up
412
/** @type {{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */
413
let extHostInspector = null;
414
/** @type {{ usedSize: number, totalSize: number } | null} */
415
let extHostHeapBefore = null;
416
/** @type {Omit<RunMetrics, 'majorGCs' | 'minorGCs' | 'gcDurationMs' | 'longTaskCount' | 'longAnimationFrameCount' | 'longAnimationFrameTotalMs' | 'timeToUIUpdated' | 'timeToFirstToken' | 'timeToComplete' | 'timeToRenderComplete' | 'layoutDurationMs' | 'instructionCollectionTime' | 'agentInvokeTime' | 'hasInternalMarks' | 'internalFirstToken'> | null} */
417
let partialMetrics = null;
418
// Timing vars hoisted for access in post-close trace parsing
419
let submitTime = 0;
420
let firstResponseTime = 0;
421
let responseCompleteTime = 0;
422
let renderCompleteTime = 0;
423
424
try {
425
await window.waitForSelector('.monaco-workbench', { timeout: 60_000 });
426
427
const cdp = await window.context().newCDPSession(window);
428
await cdp.send('Performance.enable');
429
const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
430
431
const metricsBefore = await cdp.send('Performance.getMetrics');
432
433
// Open chat
434
const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI';
435
await window.keyboard.press(chatShortcut);
436
437
const CHAT_VIEW = 'div[id="workbench.panel.chat"]';
438
const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`;
439
440
await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 });
441
await window.waitForFunction(
442
(selector) => Array.from(document.querySelectorAll(selector)).some(el => {
443
const rect = el.getBoundingClientRect();
444
return rect.width > 0 && rect.height > 0;
445
}),
446
chatEditorSel, { timeout: 15_000 },
447
);
448
449
// Dismiss dialogs
450
const dismissDialog = async () => {
451
for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) {
452
const el = await window.$(sel);
453
if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; }
454
}
455
};
456
await dismissDialog();
457
458
// Wait for extension activation
459
const reqsBefore = mockServer.requestCount();
460
try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { }
461
if (verbose) {
462
console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`);
463
}
464
465
// Connect to extension host inspector for profiling/heap data
466
try {
467
extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 });
468
await extHostInspector.send('HeapProfiler.enable');
469
await extHostInspector.send('Profiler.enable');
470
await extHostInspector.send('Profiler.start');
471
extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage');
472
if (verbose && extHostHeapBefore) {
473
console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`);
474
}
475
} catch (err) {
476
if (verbose) {
477
console.log(` [ext-host] Could not connect to inspector: ${err}`);
478
}
479
}
480
481
// Wait for model resolution
482
await new Promise(r => setTimeout(r, 3000));
483
await dismissDialog();
484
485
// Focus input
486
await window.click(chatEditorSel);
487
const focusStart = Date.now();
488
while (Date.now() - focusStart < 5_000) {
489
const focused = await window.evaluate((sel) => {
490
const el = document.querySelector(sel);
491
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
492
}, chatEditorSel).catch(() => false);
493
if (focused) { break; }
494
await new Promise(r => setTimeout(r, 50));
495
}
496
497
// Type message — use the smoke-test driver's typeInEditor when available
498
// (dev builds), fall back to pressSequentially for stable/insiders builds.
499
const chatMessage = `[scenario:${scenario}] Explain how this code works`;
500
const actualInputSelector = await window.evaluate((editorSel) => {
501
const editor = document.querySelector(editorSel);
502
if (!editor) { throw new Error('Chat editor not found'); }
503
return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea';
504
}, chatEditorSel);
505
506
const hasDriver = await window.evaluate(() =>
507
// @ts-ignore
508
!!globalThis.driver?.typeInEditor
509
).catch(() => false);
510
511
if (hasDriver) {
512
await window.evaluate(({ selector, text }) => {
513
// @ts-ignore
514
return globalThis.driver.typeInEditor(selector, text);
515
}, { selector: actualInputSelector, text: chatMessage });
516
} else {
517
// Fallback: click the input element and use pressSequentially
518
await window.click(actualInputSelector);
519
await new Promise(r => setTimeout(r, 200));
520
await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 });
521
}
522
523
// Start CPU profiler to capture call stacks during the interaction
524
await cdp.send('Profiler.enable');
525
await cdp.send('Profiler.start');
526
527
// Submit
528
const completionsBefore = mockServer.completionCount();
529
submitTime = Date.now();
530
await window.keyboard.press('Enter');
531
532
// Wait for mock server to serve the response
533
try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { }
534
firstResponseTime = Date.now();
535
536
// Wait for DOM response to settle
537
await dismissDialog();
538
const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`;
539
await window.waitForFunction(
540
(sel) => {
541
const responses = document.querySelectorAll(sel);
542
if (responses.length === 0) { return false; }
543
return !responses[responses.length - 1].classList.contains('chat-response-loading');
544
},
545
responseSelector, { timeout: 30_000 },
546
);
547
responseCompleteTime = Date.now();
548
549
// -- User turn injection loop -----------------------------------------
550
// For multi-turn scenarios with user follow-ups, type each follow-up
551
// message and wait for the model's response to settle.
552
const userTurns = getUserTurns(scenario);
553
for (let ut = 0; ut < userTurns.length; ut++) {
554
const userTurn = userTurns[ut];
555
if (verbose) {
556
console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`);
557
}
558
559
// Brief pause to let the UI settle between turns
560
await new Promise(r => setTimeout(r, 500));
561
562
// Focus the chat input
563
await window.click(chatEditorSel);
564
const utFocusStart = Date.now();
565
while (Date.now() - utFocusStart < 3_000) {
566
const focused = await window.evaluate((sel) => {
567
const el = document.querySelector(sel);
568
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
569
}, chatEditorSel).catch(() => false);
570
if (focused) { break; }
571
await new Promise(r => setTimeout(r, 50));
572
}
573
574
// Type the follow-up message
575
if (hasDriver) {
576
await window.evaluate(({ selector, text }) => {
577
// @ts-ignore
578
return globalThis.driver.typeInEditor(selector, text);
579
}, { selector: actualInputSelector, text: userTurn.message });
580
} else {
581
await window.click(actualInputSelector);
582
await new Promise(r => setTimeout(r, 200));
583
await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 });
584
}
585
586
// Submit follow-up
587
const utCompBefore = mockServer.completionCount();
588
await window.keyboard.press('Enter');
589
590
// Wait for mock server to serve the response for this turn
591
try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { }
592
593
// Wait for the new response to finish rendering.
594
// The chat list is virtualized — old response elements are
595
// recycled out of the DOM as new ones appear, so we cannot
596
// rely on counting DOM elements. Instead, scroll to the
597
// bottom and wait for no response to be in loading state.
598
await dismissDialog();
599
await window.evaluate((chatViewSel) => {
600
const input = document.querySelector(chatViewSel + ' .interactive-input-part');
601
if (input) { input.scrollIntoView({ block: 'end' }); }
602
}, CHAT_VIEW);
603
await new Promise(r => setTimeout(r, 200));
604
605
await window.waitForFunction(
606
(sel) => {
607
const responses = document.querySelectorAll(sel);
608
if (responses.length === 0) { return false; }
609
return !responses[responses.length - 1].classList.contains('chat-response-loading');
610
},
611
responseSelector,
612
{ timeout: 30_000 },
613
);
614
responseCompleteTime = Date.now();
615
616
if (verbose) {
617
const utResponseInfo = await window.evaluate((sel) => {
618
const responses = document.querySelectorAll(sel);
619
const last = responses[responses.length - 1];
620
return last ? (last.textContent || '').substring(0, 150) : '(empty)';
621
}, responseSelector);
622
console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`);
623
}
624
}
625
626
// Stop CPU profiler and save the profile
627
const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop'));
628
const profilePath = path.join(runDiagDir, 'profile.cpuprofile');
629
fs.writeFileSync(profilePath, JSON.stringify(profile));
630
if (verbose) {
631
console.log(` [debug] CPU profile saved to ${profilePath}`);
632
}
633
634
const responseInfo = await window.evaluate((sel) => {
635
const responses = document.querySelectorAll(sel);
636
const last = responses[responses.length - 1];
637
if (!last) { return { hasContent: false, text: '' }; }
638
const text = last.textContent || '';
639
return { hasContent: text.trim().length > 0, text: text.substring(0, 200) };
640
}, responseSelector);
641
642
if (verbose) {
643
console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`);
644
console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`);
645
}
646
647
// Wait for the typewriter animation to finish rendering.
648
// The chat UI animates streamed content word-by-word after the
649
// response stream completes. We need to wait until all content
650
// is rendered before capturing layout/style metrics, otherwise
651
// we miss the rendering phase where batching optimizations matter.
652
await window.waitForFunction(
653
(sel) => {
654
const responses = document.querySelectorAll(sel);
655
const last = responses[responses.length - 1];
656
if (!last) { return true; }
657
// The typewriter animation is done when there are no
658
// elements with the 'typewriter' or 'animating' class,
659
// and no pending cursor animations.
660
const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor');
661
return !hasAnimating;
662
},
663
responseSelector,
664
{ timeout: 30_000 },
665
).catch(() => {
666
// Fallback: if the selector-based check doesn't work (e.g.
667
// the CSS classes differ across versions), wait for content
668
// to stabilize by polling textContent.
669
});
670
671
// Additional stabilization: poll until textContent stops changing.
672
// This catches any remaining animation regardless of CSS class names.
673
{
674
let prev = '';
675
let stableCount = 0;
676
const stabilizeStart = Date.now();
677
while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) {
678
const current = await window.evaluate((sel) => {
679
const responses = document.querySelectorAll(sel);
680
const last = responses[responses.length - 1];
681
return last ? (last.textContent || '') : '';
682
}, responseSelector).catch(() => '');
683
if (current === prev) {
684
stableCount++;
685
} else {
686
stableCount = 0;
687
prev = current;
688
}
689
await new Promise(r => setTimeout(r, 100));
690
}
691
}
692
renderCompleteTime = Date.now();
693
if (verbose) {
694
console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`);
695
}
696
697
const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
698
const metricsAfter = await cdp.send('Performance.getMetrics');
699
700
// -- Extension host metrics (non-snapshot) ---------------------------
701
let extHostHeapUsedBefore = -1;
702
let extHostHeapUsedAfter = -1;
703
let extHostHeapDelta = -1;
704
let extHostHeapDeltaPostGC = -1;
705
let extHostProfilePath = '';
706
let extHostSnapshotPath = '';
707
if (extHostInspector && extHostHeapBefore) {
708
try {
709
extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024);
710
711
// Stop CPU profiler and save
712
const extProfile = await extHostInspector.send('Profiler.stop');
713
extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile');
714
fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile));
715
if (verbose) {
716
console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`);
717
}
718
719
// Heap usage after interaction
720
const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage');
721
extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024);
722
extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore;
723
724
// Force GC and measure retained heap
725
try {
726
await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
727
await new Promise(r => setTimeout(r, 200));
728
const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage');
729
extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore;
730
} catch {
731
extHostHeapDeltaPostGC = -1;
732
}
733
734
if (verbose) {
735
console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`);
736
}
737
} catch (err) {
738
if (verbose) {
739
console.log(` [ext-host] Error collecting metrics: ${err}`);
740
}
741
}
742
}
743
744
// -- Heap snapshots (opt-in, parallelized) ---------------------------
745
let snapshotPath = '';
746
if (takeHeapSnapshots) {
747
const snapshotPromises = [];
748
749
// Renderer snapshot
750
snapshotPromises.push((async () => {
751
const p = path.join(runDiagDir, 'heap.heapsnapshot');
752
await cdp.send('HeapProfiler.enable');
753
const chunks = /** @type {string[]} */ ([]);
754
cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
755
chunks.push(params.chunk);
756
});
757
await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
758
fs.writeFileSync(p, chunks.join(''));
759
return p;
760
})());
761
762
// Extension host snapshot (parallel with renderer)
763
if (extHostInspector && extHostHeapBefore) {
764
snapshotPromises.push((async () => {
765
const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot');
766
const chunks = /** @type {string[]} */ ([]);
767
extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
768
chunks.push(params.chunk);
769
});
770
await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
771
fs.writeFileSync(p, chunks.join(''));
772
return p;
773
})());
774
}
775
776
const snapshotResults = await Promise.all(snapshotPromises);
777
snapshotPath = snapshotResults[0];
778
if (snapshotResults.length > 1) {
779
extHostSnapshotPath = snapshotResults[1];
780
}
781
782
if (verbose) {
783
console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`);
784
if (extHostSnapshotPath) {
785
console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`);
786
}
787
}
788
}
789
790
// Close ext host inspector now that snapshots (if any) are done
791
if (extHostInspector) {
792
extHostInspector.close();
793
}
794
795
// Store partial metrics here so we can combine with trace data after close.
796
797
/** @param {any} r @param {string} name */
798
function getMetric(r, name) {
799
const e = r.metrics?.find((/** @type {any} */ m) => m.name === name);
800
return e ? e.value : 0;
801
}
802
803
partialMetrics = {
804
heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024),
805
heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024),
806
heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024),
807
heapDeltaPostGC: await (async () => {
808
// Force a full GC then measure heap to get deterministic retained-memory delta.
809
// --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc()
810
// when includeCommandLineAPI is true.
811
try {
812
await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
813
await new Promise(r => setTimeout(r, 200));
814
const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
815
return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024);
816
} catch {
817
return -1; // gc() not available in this build
818
}
819
})(),
820
layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'),
821
recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'),
822
forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'),
823
frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'),
824
compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'),
825
paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'),
826
responseHasContent: responseInfo.hasContent,
827
profilePath,
828
tracePath,
829
snapshotPath,
830
extHostHeapUsedBefore,
831
extHostHeapUsedAfter,
832
extHostHeapDelta,
833
extHostHeapDeltaPostGC,
834
extHostProfilePath,
835
extHostSnapshotPath,
836
};
837
} finally {
838
if (extHostInspector) {
839
try { extHostInspector.close(); } catch { }
840
}
841
activeVSCode = null;
842
await vscode.close();
843
}
844
845
// Read the trace file written by VS Code on exit via --trace-startup-file
846
/** @type {Array<any>} */
847
let traceEvents = [];
848
try {
849
const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8'));
850
traceEvents = traceData.traceEvents || [];
851
} catch {
852
// Trace file may not exist if VS Code crashed before shutdown
853
}
854
855
// Extract code/chat/* perf marks from blink.user_timing trace events.
856
// These appear as instant ('R' or 'I') events with timestamps in microseconds.
857
const chatMarks = traceEvents
858
.filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/'))
859
.map(e => ({ name: e.name, startTime: e.ts / 1000 }));
860
861
if (verbose && chatMarks.length > 0) {
862
console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`);
863
}
864
865
// Parse timing — prefer internal code/chat/* marks (precise, in-process)
866
// with client-side Date.now() as fallback for older builds without marks.
867
const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated');
868
const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken');
869
const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime);
870
const timeToComplete = responseCompleteTime - submitTime;
871
const timeToRenderComplete = renderCompleteTime - submitTime;
872
const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions');
873
const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke');
874
875
// Parse GC events from trace.
876
// Use the trace-event category and phase fields which are stable
877
// across V8 versions, rather than matching event name substrings.
878
let majorGCs = 0, minorGCs = 0, gcDurationMs = 0;
879
for (const event of traceEvents) {
880
const isGC = event.cat === 'v8.gc'
881
|| event.cat === 'devtools.timeline,v8'
882
|| (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => {
883
const t = c.trim();
884
return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats';
885
}));
886
if (!isGC) { continue; }
887
// Only count complete ('X') or duration-begin ('B') events to
888
// avoid double-counting begin/end pairs.
889
if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; }
890
const name = event.name || '';
891
if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; }
892
else if (/Minor|Scaveng/i.test(name)) { minorGCs++; }
893
else { minorGCs++; } // default unknown GC events to minor
894
if (event.dur) { gcDurationMs += event.dur / 1000; }
895
}
896
// Parse Layout duration from devtools.timeline trace events.
897
let layoutDurationMs = 0;
898
for (const event of traceEvents) {
899
if (event.name === 'Layout' && event.ph === 'X' && event.dur) {
900
layoutDurationMs += event.dur / 1000;
901
}
902
}
903
904
let longTaskCount = 0;
905
for (const event of traceEvents) {
906
if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; }
907
}
908
909
// Parse Long Animation Frame (LoAF) events from devtools.timeline trace.
910
// AnimationFrame events use async flow pairs (ph:'s' start, ph:'f' finish)
911
// with matching ids. Compute duration from each s→f pair.
912
let longAnimationFrameCount = 0;
913
let longAnimationFrameTotalMs = 0;
914
{
915
/** @type {Map<number, number>} */
916
const frameStarts = new Map();
917
for (const event of traceEvents) {
918
if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') {
919
if (event.ph === 's') {
920
frameStarts.set(event.id, event.ts);
921
} else if (event.ph === 'f' && frameStarts.has(event.id)) {
922
const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000;
923
frameStarts.delete(event.id);
924
if (durationMs > 50) {
925
longAnimationFrameCount++;
926
longAnimationFrameTotalMs += durationMs;
927
}
928
}
929
}
930
}
931
}
932
933
return {
934
...partialMetrics,
935
timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime,
936
hasInternalMarks: chatMarks.length > 0,
937
internalFirstToken,
938
majorGCs, minorGCs,
939
gcDurationMs: Math.round(gcDurationMs * 100) / 100,
940
layoutDurationMs: Math.round(layoutDurationMs * 100) / 100,
941
longTaskCount,
942
longAnimationFrameCount,
943
longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100,
944
};
945
}
946
947
// -- CI summary generation ---------------------------------------------------
948
949
const GITHUB_REPO = 'https://github.com/microsoft/vscode';
950
951
/**
 * Render a build identifier as Markdown.
 * - Commit SHAs become links to the commit page (shortened to 7 chars).
 * - Semver version strings become links to the release tag page.
 * - Anything else (e.g. "baseline", "dev (local)") is shown as inline code.
 * @param {string} label
 * @returns {string}
 */
function formatBuildLink(label) {
	const looksLikeSha = /^[0-9a-f]{7,40}$/.test(label);
	if (looksLikeSha) {
		// Short SHA for display, full SHA in the URL.
		return `[\`${label.substring(0, 7)}\`](${GITHUB_REPO}/commit/${label})`;
	}
	const looksLikeVersion = /^\d+\.\d+\.\d+/.test(label);
	return looksLikeVersion
		? `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`
		: `\`${label}\``;
}
969
970
/**
 * Build a GitHub compare link between two build identifiers, if both are
 * commit-like or version-like references. Returns empty string otherwise.
 * @param {string} base
 * @param {string} test
 * @returns {string}
 */
function formatCompareLink(base, test) {
	const looksLikeRef = (/** @type {string} */ id) =>
		/^[0-9a-f]{7,40}$/.test(id) || /^\d+\.\d+\.\d+/.test(id);
	if (looksLikeRef(base) && looksLikeRef(test)) {
		return `[compare](${GITHUB_REPO}/compare/${base}...${test})`;
	}
	return '';
}
984
985
/**
 * Generate a detailed Markdown summary table for CI.
 * Printed to stdout and written to ci-summary.md.
 *
 * @param {Record<string, any>} jsonReport
 * @param {Record<string, any> | null} baseline
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
 * @returns {string} The full Markdown document (lines joined with '\n').
 */
function generateCISummary(jsonReport, baseline, opts) {
	// Display labels for the two builds. 'dev'/'production' build modes refer
	// to local source builds; otherwise the explicit --build value (or the
	// raw mode string) is used.
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);
	// [metric name, group key within the per-scenario stats object, display unit]
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can produce a REGRESSION/improved verdict; all others
	// are reported as informational ('info') without pass/fail judgment.
	const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']);

	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);

	// -- Collect verdicts per scenario/metric --------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;

	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];

		// Verdicts are only computed when baseline data exists for the scenario;
		// otherwise the scenario gets an empty verdict list (rendered as "no
		// baseline" in the detail section below).
		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }

				// Relative change of medians; guarded against a zero baseline.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);

				// Raw per-run values feed the t-test. Negative values are
				// "metric unavailable" sentinels (e.g. -1 when gc() could not
				// run) and are excluded.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';

				// Verdict logic: a regression metric must both cross its
				// configured threshold AND be statistically significant (or
				// have no computable t-test) to count as REGRESSION; a
				// threshold-crossing but insignificant change is 'noise'.
				// Improvements additionally require significance.
				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let verdict = '';
				if (isRegressionMetric) {
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}

				// "median ± stddev" display strings (\xb1 is the ± sign).
				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}

	// -- Header with verdict up front ----------------------------------------
	const hasRegressions = totalRegressions > 0;
	const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
	const verdictText = hasRegressions
		? `${totalRegressions} regression(s) detected`
		: totalImprovements > 0
			? `No regressions \u2014 ${totalImprovements} improvement(s)`
			: 'No significant changes';

	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// Per-metric threshold overrides: only show the ones that actually differ
	// from the global fractional threshold.
	const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
		const parsed = parseMetricThreshold(v);
		return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
	});
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			const parsed = parseMetricThreshold(v);
			// Absolute thresholds on time-like metrics get an 'ms' suffix;
			// fractional thresholds render as a percentage.
			return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
	if (jsonReport.buildMode) {
		lines.push(`| **Build mode** | ${jsonReport.buildMode} |`);
	}
	lines.push('');
	if (jsonReport.mismatchedBuildMode) {
		lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.');
		lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,');
		lines.push('> use the same build type for both (e.g. `--production-build` with a local');
		lines.push('> baseline path, or two version strings).');
		lines.push('');
	}

	// -- At-a-glance overview table: one row per scenario --------------------
	lines.push(`## Overview`);
	lines.push('');
	lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);

		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');

		// Render a percent-change cell; em dash (\u2014) when no data exists.
		const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => {
			if (!v) { return '\u2014'; }
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
			return pct;
		};

		// Scenario-level verdict: any regression wins over any improvement.
		const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => {
			const hasRegression = vs.some(v => v.verdict === 'REGRESSION');
			const hasImproved = vs.some(v => v.verdict === 'improved');
			if (hasRegression) { return '\u274C Regressed'; }
			if (hasImproved) { return '\u2B06\uFE0F Improved'; }
			return '\u2705 OK';
		};

		const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
		const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts));

		lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');

	// -- Regressions & improvements detail section ---------------------------
	const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
	if (hasNotable) {
		lines.push('## Regressions & Improvements');
		lines.push('');
		lines.push('Only metrics that regressed or improved significantly are shown below.');
		lines.push('');

		for (const scenario of scenarios) {
			const verdicts = scenarioVerdicts.get(scenario) || [];
			const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
			if (notable.length === 0) { continue; }

			const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`### ${icon} ${scenario}`);
			lines.push('');
			lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
			lines.push('|--------|----------|------|--------|---------|---------|');
			for (const v of notable) {
				const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
				const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
				lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`);
			}
			lines.push('');
		}
	}

	// -- Full metric tables in collapsible section ---------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];

		lines.push(`### ${scenario}`);
		lines.push('');

		// Without a baseline, show absolute stats only (no change/verdict).
		if (!base) {
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}

		lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
		lines.push(`|--------|----------|------|--------|---------|---------|`);

		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');

	// -- Raw run data in collapsible section ---------------------------------
	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		lines.push(`### ${scenario}`);
		lines.push('');
		lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
		lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
		const runs = current.rawRuns || [];
		for (let i = 0; i < runs.length; i++) {
			const r = runs[i];
			const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
			lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
		}
		lines.push('');
	}
	// Baseline raw runs mirror the test-run tables above.
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			lines.push(`### ${scenario} (baseline)`);
			lines.push('');
			lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
			lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
			const runs = base.rawRuns || [];
			for (let i = 0; i < runs.length; i++) {
				const r = runs[i];
				const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
				lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
			}
			lines.push('');
		}
	}
	lines.push('</details>');
	lines.push('');

	return lines.join('\n');
}
1273
1274
// -- Cleanup on SIGINT/SIGTERM -----------------------------------------------

/** @type {{ close: () => Promise<void> } | null} */
let activeVSCode = null;
/** @type {{ close: () => Promise<void> } | null} */
let activeMockServer = null;

/**
 * Register SIGINT/SIGTERM handlers that best-effort close the in-flight
 * VS Code instance and mock LLM server, then exit with status 130.
 */
function installSignalHandlers() {
	const cleanup = async () => {
		console.log('\n[chat-simulation] Caught interrupt, cleaning up...');
		// Close failures are ignored: we are shutting down anyway.
		try { await activeVSCode?.close(); } catch { }
		try { await activeMockServer?.close(); } catch { }
		process.exit(130);
	};
	for (const signal of ['SIGINT', 'SIGTERM']) {
		process.on(signal, cleanup);
	}
}
1291
1292
// -- Diagnostic cleanup ------------------------------------------------------

/**
 * Remove large diagnostic files (heap snapshots, CPU profiles, traces) from
 * a run's metrics to free disk space. Keeps the JSON results data intact.
 * Missing paths and delete failures are ignored (best-effort cleanup).
 * @param {RunMetrics} metrics
 */
function cleanupRunDiagnostics(metrics) {
	const filesToDelete = [
		metrics.profilePath,
		metrics.tracePath,
		metrics.snapshotPath,
		metrics.extHostProfilePath,
		metrics.extHostSnapshotPath,
	];
	for (const filePath of filesToDelete) {
		if (!filePath) { continue; }
		try {
			// force:true makes a missing file a no-op, so no existsSync
			// pre-check is needed (the check-then-delete pattern was also
			// racy if the file vanished between the two calls).
			fs.rmSync(filePath, { force: true });
		} catch {
			// Ignore cleanup errors (e.g. permissions, file in use)
		}
	}
}
1317
1318
/**
 * Clean up diagnostics for all scenarios that did NOT regress.
 * Keeps diagnostics for regressed scenarios so they can be investigated.
 * @param {Record<string, RunMetrics[]>} allResults - test results by scenario
 * @param {Set<string>} regressedScenarios - scenarios that regressed
 */
function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) {
	for (const [scenario, runs] of Object.entries(allResults)) {
		// Regressed scenarios keep their diagnostic files for investigation.
		if (!regressedScenarios.has(scenario)) {
			runs.forEach((metrics) => cleanupRunDiagnostics(metrics));
		}
	}
}
1334
1335
// -- Main --------------------------------------------------------------------
1336
1337
async function main() {
1338
registerPerfScenarios();
1339
const opts = parseArgs();
1340
1341
installSignalHandlers();
1342
1343
const { startServer } = require('./common/mock-llm-server');
1344
const mockServer = await startServer(0);
1345
activeMockServer = mockServer;
1346
console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`);
1347
1348
// -- Resume mode --------------------------------------------------------
1349
if (opts.resume) {
1350
if (!fs.existsSync(opts.resume)) {
1351
console.error(`[chat-simulation] Resume file not found: ${opts.resume}`);
1352
process.exit(1);
1353
}
1354
const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8'));
1355
const prevDir = path.dirname(opts.resume);
1356
1357
// Find the associated baseline JSON in the same directory
1358
const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json'));
1359
const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null;
1360
const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null;
1361
1362
// Determine which scenarios to resume (default: all from previous run)
1363
const resumeScenarios = opts.scenarios.length > 0
1364
? opts.scenarios.filter(s => prevResults.scenarios?.[s])
1365
: Object.keys(prevResults.scenarios || {});
1366
1367
if (resumeScenarios.length === 0) {
1368
console.error('[chat-simulation] No matching scenarios found in previous results');
1369
process.exit(1);
1370
}
1371
1372
const testElectron = await resolveBuild(opts.build);
1373
const baselineVersion = prevBaseline?.baselineBuildVersion;
1374
const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null;
1375
1376
const runsToAdd = opts.runs;
1377
console.log(`[chat-simulation] Resuming from: ${opts.resume}`);
1378
console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`);
1379
console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`);
1380
if (prevBaseline) {
1381
console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`);
1382
}
1383
console.log('');
1384
1385
for (const scenario of resumeScenarios) {
1386
console.log(`[chat-simulation] === Resuming: ${scenario} ===`);
1387
const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || [];
1388
const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || [];
1389
1390
// Run additional test iterations
1391
console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`);
1392
for (let i = 0; i < runsToAdd; i++) {
1393
const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`;
1394
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
1395
try {
1396
const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
1397
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1398
if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); }
1399
prevTestRuns.push(m);
1400
if (opts.verbose) {
1401
const src = m.hasInternalMarks ? 'internal' : 'client-side';
1402
console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`);
1403
}
1404
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1405
}
1406
1407
// Run additional baseline iterations
1408
if (baselineElectron && prevBaseline?.scenarios?.[scenario]) {
1409
console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`);
1410
for (let i = 0; i < runsToAdd; i++) {
1411
const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`;
1412
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
1413
try {
1414
const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
1415
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1416
if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); }
1417
prevBaseRuns.push(m);
1418
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1419
}
1420
}
1421
1422
// Recompute stats with merged data
1423
const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns });
1424
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); }
1425
prevResults.scenarios[scenario] = sd;
1426
1427
if (prevBaseline?.scenarios?.[scenario]) {
1428
const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns });
1429
for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); }
1430
prevBaseline.scenarios[scenario] = bsd;
1431
}
1432
console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`);
1433
console.log('');
1434
}
1435
1436
// Write updated files back
1437
prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs));
1438
prevResults.lastResumed = new Date().toISOString();
1439
fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2));
1440
console.log(`[chat-simulation] Updated results: ${opts.resume}`);
1441
1442
if (prevBaseline && baselineFile) {
1443
prevBaseline.lastResumed = new Date().toISOString();
1444
fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2));
1445
// Also update cached baseline
1446
const cachedPath = path.join(DATA_DIR, path.basename(baselineFile));
1447
fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2));
1448
console.log(`[chat-simulation] Updated baseline: ${baselineFile}`);
1449
}
1450
1451
// -- Re-run comparison with merged data --------------------------------
1452
opts.baseline = baselineFile || undefined;
1453
const jsonReport = prevResults;
1454
jsonReport._resultsPath = opts.resume;
1455
1456
// Fall through to comparison logic below
1457
await printComparison(jsonReport, opts);
1458
await mockServer.close();
1459
return;
1460
}
1461
1462
// -- Normal (non-resume) flow -------------------------------------------
1463
// --production-build: build a local bundled (non-dev) package from the
1464
// current source tree using `gulp vscode`. This produces the same
1465
// packaging as a release build (bundled JS, no VSCODE_DEV) while still
1466
// testing your local changes.
1467
if (opts.productionBuild && !opts.build) {
1468
const prodBuildPath = buildProductionBuild();
1469
opts.build = prodBuildPath;
1470
console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`);
1471
}
1472
1473
const electronPath = await resolveBuild(opts.build);
1474
1475
if (!fs.existsSync(electronPath)) {
1476
console.error(`Electron not found at: ${electronPath}`);
1477
console.error('Run "node build/lib/preLaunch.ts" first, or pass --build <path>');
1478
process.exit(1);
1479
}
1480
1481
// Detect build modes for both test and baseline builds
1482
const testBuildMode = detectBuildMode(electronPath);
1483
1484
// Resolve the baseline build path early so we can detect its mode.
1485
// For version strings this downloads; for local paths it resolves directly.
1486
const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild);
1487
const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString;
1488
/** @type {string | undefined} */
1489
let baselineElectronPath;
1490
if (isBaselineLocalPath) {
1491
baselineElectronPath = await resolveBuild(opts.baselineBuild);
1492
if (!fs.existsSync(baselineElectronPath)) {
1493
console.error(`Baseline build not found at: ${baselineElectronPath}`);
1494
process.exit(1);
1495
}
1496
}
1497
const baselineBuildMode = opts.baselineBuild
1498
? (isBaselineVersionString ? 'release' : detectBuildMode(baselineElectronPath || ''))
1499
: undefined;
1500
1501
const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode;
1502
1503
// Create a timestamped run directory for all output
1504
const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
1505
const runDir = path.join(DATA_DIR, runTimestamp);
1506
fs.mkdirSync(runDir, { recursive: true });
1507
console.log(`[chat-simulation] Output: ${runDir}`);
1508
1509
// Compute effective settings per role
1510
const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides };
1511
const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides };
1512
1513
// -- Baseline build --------------------------------------------------
1514
if (opts.baselineBuild) {
1515
// Use a sanitized label for file names — replace path separators for local paths
1516
const baselineLabel = isBaselineLocalPath
1517
? path.basename(path.resolve(opts.baselineBuild))
1518
: opts.baselineBuild;
1519
const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`);
1520
1521
// Local paths: always run fresh (no caching — the build may have changed)
1522
// Version strings: use caching as before
1523
const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`);
1524
const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath)
1525
? JSON.parse(fs.readFileSync(cachedPath, 'utf-8'))
1526
: null;
1527
1528
if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) {
1529
// Check if the cache covers all requested scenarios
1530
const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {}));
1531
const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s));
1532
1533
// Also check if cached scenarios have fewer runs than requested
1534
const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => {
1535
const cached = cachedBaseline.scenarios?.[s];
1536
return cached && (cached.rawRuns?.length || 0) < opts.runs;
1537
});
1538
1539
if (missingScenarios.length === 0 && shortScenarios.length === 0) {
1540
console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`);
1541
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
1542
opts.baseline = baselineJsonPath;
1543
} else {
1544
const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])];
1545
if (missingScenarios.length > 0) {
1546
console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`);
1547
}
1548
if (shortScenarios.length > 0) {
1549
console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`);
1550
}
1551
console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`);
1552
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
1553
for (const scenario of scenariosToRun) {
1554
const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || [];
1555
const runsNeeded = opts.runs - existingRuns.length;
1556
/** @type {RunMetrics[]} */
1557
const newResults = [];
1558
for (let i = 0; i < runsNeeded; i++) {
1559
try {
1560
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
1561
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1562
if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); }
1563
newResults.push(m);
1564
}
1565
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
1566
}
1567
const allRuns = [...existingRuns, ...newResults];
1568
if (allRuns.length > 0) {
1569
const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns });
1570
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); }
1571
cachedBaseline.scenarios[scenario] = sd;
1572
}
1573
}
1574
cachedBaseline.runsPerScenario = opts.runs;
1575
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
1576
if (cachedPath) {
1577
fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2));
1578
}
1579
opts.baseline = baselineJsonPath;
1580
}
1581
} else {
1582
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
1583
console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`);
1584
/** @type {Record<string, RunMetrics[]>} */
1585
const baselineResults = {};
1586
for (const scenario of opts.scenarios) {
1587
/** @type {RunMetrics[]} */
1588
const results = [];
1589
for (let i = 0; i < opts.runs; i++) {
1590
try {
1591
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
1592
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1593
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
1594
results.push(m);
1595
}
1596
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
1597
}
1598
if (results.length > 0) { baselineResults[scenario] = results; }
1599
}
1600
const baselineReport = {
1601
timestamp: new Date().toISOString(),
1602
baselineBuildVersion: opts.baselineBuild,
1603
platform: process.platform,
1604
runsPerScenario: opts.runs,
1605
scenarios: /** @type {Record<string, any>} */ ({}),
1606
};
1607
for (const [scenario, results] of Object.entries(baselineResults)) {
1608
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
1609
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
1610
baselineReport.scenarios[scenario] = sd;
1611
}
1612
fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2));
1613
// Cache at the top level for reuse across runs (version strings only)
1614
if (cachedPath) {
1615
fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2));
1616
}
1617
opts.baseline = baselineJsonPath;
1618
}
1619
console.log('');
1620
}
1621
1622
// -- Run benchmarks --------------------------------------------------
1623
console.log(`[chat-simulation] Electron: ${electronPath}`);
1624
console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`);
1625
if (baselineBuildMode) {
1626
console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`);
1627
}
1628
console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`);
1629
console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`);
1630
if (Object.keys(opts.settingsOverrides).length > 0) {
1631
console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`);
1632
}
1633
if (Object.keys(opts.testSettingsOverrides).length > 0) {
1634
console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`);
1635
}
1636
if (Object.keys(opts.baselineSettingsOverrides).length > 0) {
1637
console.log(`[chat-simulation] Settings overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`);
1638
}
1639
1640
if (isMismatchedBuildMode) {
1641
console.log('');
1642
console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`);
1643
console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples');
1644
console.log('[chat-simulation] comparisons, use the same build type for both.');
1645
if (testBuildMode === 'dev') {
1646
console.log('[chat-simulation] To use a local production build instead:');
1647
console.log('[chat-simulation] npm run perf:chat -- --production-build');
1648
}
1649
if (!opts.ci && !opts.force) {
1650
const readline = require('readline');
1651
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
1652
const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve));
1653
rl.close();
1654
if (String(answer).toLowerCase() !== 'y') {
1655
console.log('[chat-simulation] Aborted.');
1656
await mockServer.close();
1657
process.exit(0);
1658
}
1659
}
1660
}
1661
console.log('');
1662
1663
/** @type {Record<string, RunMetrics[]>} */
1664
const allResults = {};
1665
let anyFailed = false;
1666
1667
for (const scenario of opts.scenarios) {
1668
console.log(`[chat-simulation] === Scenario: ${scenario} ===`);
1669
/** @type {RunMetrics[]} */
1670
const results = [];
1671
for (let i = 0; i < opts.runs; i++) {
1672
console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`);
1673
try {
1674
const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots });
1675
// Clean up previous run's diagnostics to bound disk usage; keep the latest
1676
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
1677
results.push(metrics);
1678
if (opts.verbose) {
1679
const src = metrics.hasInternalMarks ? 'internal' : 'client-side';
1680
console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`);
1681
}
1682
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
1683
}
1684
if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; }
1685
else { allResults[scenario] = results; }
1686
console.log('');
1687
}
1688
1689
// -- Summary ---------------------------------------------------------
1690
console.log('[chat-simulation] ======================= Summary =======================');
1691
for (const [scenario, results] of Object.entries(allResults)) {
1692
console.log('');
1693
console.log(` -- ${scenario} (${results.length} runs) --`);
1694
console.log('');
1695
console.log(' Timing:');
1696
console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms'));
1697
console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms'));
1698
console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms'));
1699
console.log('');
1700
console.log(' Rendering:');
1701
console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', ''));
1702
console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms'));
1703
console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', ''));
1704
console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', ''));
1705
console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', ''));
1706
console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', ''));
1707
console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms'));
1708
console.log(summarize(results.map(r => r.frameCount), ' Frames ', ''));
1709
console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', ''));
1710
console.log(summarize(results.map(r => r.paintCount), ' Paints ', ''));
1711
console.log('');
1712
console.log(' Memory:');
1713
console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB'));
1714
console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
1715
console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms'));
1716
if (results.some(r => r.extHostHeapDelta >= 0)) {
1717
console.log('');
1718
console.log(' Extension Host:');
1719
console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB'));
1720
console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB'));
1721
console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB'));
1722
console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
1723
}
1724
}
1725
1726
// -- JSON output -----------------------------------------------------
1727
const jsonPath = path.join(runDir, 'results.json');
1728
const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record<string, any>, _resultsPath?: string }} */ ({
1729
timestamp: new Date().toISOString(),
1730
platform: process.platform,
1731
runsPerScenario: opts.runs,
1732
buildMode: testBuildMode,
1733
mismatchedBuildMode: !!isMismatchedBuildMode,
1734
scenarios: /** @type {Record<string, any>} */ ({}),
1735
});
1736
for (const [scenario, results] of Object.entries(allResults)) {
1737
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
1738
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
1739
jsonReport.scenarios[scenario] = sd;
1740
}
1741
fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2));
1742
jsonReport._resultsPath = jsonPath;
1743
console.log('');
1744
console.log(`[chat-simulation] Results written to ${jsonPath}`);
1745
1746
// -- Save baseline ---------------------------------------------------
1747
if (opts.saveBaseline) {
1748
if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline <path>'); process.exit(1); }
1749
fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2));
1750
console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`);
1751
}
1752
1753
// -- Baseline comparison ---------------------------------------------
1754
const regressedScenarios = await printComparison(jsonReport, opts);
1755
1756
// Clean up diagnostics for scenarios that did not regress
1757
if (opts.cleanupDiagnostics) {
1758
cleanupNonRegressedDiagnostics(allResults, regressedScenarios);
1759
}
1760
1761
if (anyFailed) { process.exit(1); }
1762
await mockServer.close();
1763
}
1764
1765
/**
 * Print baseline comparison and exit with code 1 if regressions found.
 * Returns the set of scenario IDs that regressed.
 *
 * Regression metrics fail the run only when the change exceeds the configured
 * threshold AND Welch's t-test on the raw per-run values reports significance.
 * Info metrics are displayed but never trigger failure.
 *
 * @param {Record<string, any>} jsonReport
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts
 * @returns {Promise<Set<string>>}
 */
async function printComparison(jsonReport, opts) {
	let regressionFound = false;
	let inconclusiveFound = false;
	/** @type {Set<string>} */
	const regressedScenarios = new Set();

	// Extract a metric's raw per-run values from a scenario summary, dropping
	// negative sentinel values (used when a metric was unavailable for a run).
	const rawValues = (/** @type {any} */ sd, /** @type {string} */ metric) =>
		(sd?.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);

	if (opts.baseline && fs.existsSync(opts.baseline)) {
		const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'));
		console.log('');
		console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`);
		console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`);
		if (jsonReport.mismatchedBuildMode) {
			console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`);
			console.log('[chat-simulation] Results may not be directly comparable.');
		}
		console.log('');

		// Metrics that trigger regression failure when they exceed the threshold
		const regressionMetrics = [
			// [metric, group, unit]
			['timeToFirstToken', 'timing', 'ms'],
			['timeToComplete', 'timing', 'ms'],
			['layoutCount', 'rendering', ''],
			['recalcStyleCount', 'rendering', ''],
			['forcedReflowCount', 'rendering', ''],
			['longTaskCount', 'rendering', ''],
		];
		// Informational metrics — shown in comparison but don't trigger failure
		const infoMetrics = [
			['heapDelta', 'memory', 'MB'],
			['gcDurationMs', 'memory', 'ms'],
			['extHostHeapDelta', 'extHost', 'MB'],
			['extHostHeapDeltaPostGC', 'extHost', 'MB'],
		];

		for (const scenario of Object.keys(jsonReport.scenarios)) {
			const current = jsonReport.scenarios[scenario];
			const base = baseline.scenarios?.[scenario];
			if (!base) { console.log(` ${scenario}: (no baseline)`); continue; }

			/** @type {string[]} */
			const diffs = [];
			let scenarioRegression = false;

			for (const [metric, group, unit] of regressionMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// FIX: the previous guard (`!bas.median`) used truthiness, so any
				// metric with a baseline median of 0 (common for counters such as
				// forcedReflowCount / longTaskCount) was silently skipped and a
				// 0 → N regression was never reported. Require both medians to be
				// numbers and guard the division explicitly instead, mirroring the
				// info-metrics branch below.
				if (!cur || !bas || typeof cur.median !== 'number' || typeof bas.median !== 'number') { continue; }
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;

				// Statistical significance via Welch's t-test on raw run values
				const curRaw = rawValues(current, metric);
				const basRaw = rawValues(base, metric);
				const ttest = welchTTest(basRaw, curRaw);

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let flag = '';
				if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
					if (!ttest) {
						flag = ' ← possible regression (n too small for significance test)';
						inconclusiveFound = true;
					} else if (ttest.significant) {
						flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`;
						scenarioRegression = true;
						regressionFound = true;
					} else {
						flag = ` (likely noise — p=${ttest.pValue}, not significant)`;
						inconclusiveFound = true;
					}
				} else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') {
					flag = ` (significant increase, p=${ttest.pValue})`;
				}
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`);
			}
			for (const [metric, group, unit] of infoMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`);
			}
			console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`);
			if (scenarioRegression) { regressedScenarios.add(scenario); }
			diffs.forEach(d => console.log(d));
		}

		console.log('');
		console.log(regressionFound
			? `[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance`
			: `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`);

		if (inconclusiveFound && !regressionFound) {
			// Find the results.json path to suggest in the hint
			const resultsPath = Object.keys(jsonReport.scenarios).length > 0
				? (jsonReport._resultsPath || opts.resume || 'path/to/results.json')
				: 'path/to/results.json';
			// Estimate required runs from the observed effect size and variance
			// using power analysis for Welch's t-test (alpha=0.05, 80% power).
			// n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d
			let maxNeeded = 0;
			for (const scenario of Object.keys(jsonReport.scenarios)) {
				const current = jsonReport.scenarios[scenario];
				const base = baseline.scenarios?.[scenario];
				if (!base) { continue; }
				for (const [metric] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) {
					const curRaw = rawValues(current, metric);
					const basRaw = rawValues(base, metric);
					if (curRaw.length < 2 || basRaw.length < 2) { continue; }
					const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length;
					const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length;
					const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1);
					const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1);
					const pooledSD = Math.sqrt((varA + varB) / 2);
					if (pooledSD === 0) { continue; }
					const d = Math.abs(meanB - meanA) / pooledSD;
					if (d === 0) { continue; }
					// z_0.025 = 1.96, z_0.2 = 0.842
					const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2);
					const currentN = Math.min(curRaw.length, basRaw.length);
					maxNeeded = Math.max(maxNeeded, nPerGroup - currentN);
				}
			}
			const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20));
			console.log('');
			console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.');
			console.log('[chat-simulation] To increase confidence, add more runs with --resume:');
			console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`);
		}
	}

	// -- CI summary ------------------------------------------------------
	if (opts.ci) {
		const ciBaseline = opts.baseline && fs.existsSync(opts.baseline)
			? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'))
			: null;
		const summary = generateCISummary(jsonReport, ciBaseline, {
			threshold: opts.threshold,
			metricThresholds: opts.metricThresholds,
			runs: jsonReport.runsPerScenario || opts.runs,
			baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild,
			build: opts.build,
		});

		// Write to file for GitHub Actions $GITHUB_STEP_SUMMARY
		const summaryPath = path.join(DATA_DIR, 'ci-summary.md');
		fs.writeFileSync(summaryPath, summary);
		console.log(`[chat-simulation] CI summary written to ${summaryPath}`);

		// Also print the full summary table to stdout
		console.log('');
		console.log('==================================================================');
		console.log(' CHAT PERF COMPARISON RESULTS ');
		console.log('==================================================================');
		console.log('');
		console.log(summary);
	}

	if (regressionFound) { process.exit(1); }
	return regressedScenarios;
}
1934
1935
// Entry point: any unhandled failure in the benchmark driver is fatal.
main().catch((err) => {
	console.error(err);
	process.exit(1);
});
1936
1937