// Path: blob/main/scripts/chat-simulation/test-chat-perf-regression.js
// 13379 views
/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

// @ts-check

/**
 * Chat performance benchmark.
 *
 * Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local
 * mock LLM server. Measures the full stack: prompt building, context
 * gathering, tool resolution, rendering, GC, and layout overhead.
 *
 * Usage:
 *   npm run perf:chat                          # all scenarios vs 1.115.0
 *   npm run perf:chat -- --runs 10             # 10 runs per scenario
 *   npm run perf:chat -- --scenario text-only  # single scenario
 *   npm run perf:chat -- --no-baseline         # skip baseline comparison
 *   npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0
 *   npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3
 */

const path = require('path');
const fs = require('fs');
const {
	ROOT, DATA_DIR, METRIC_DEFS, loadConfig,
	resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir,
	robustStats, welchTTest, summarize, markDuration, launchVSCode,
	getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot,
} = require('./common/utils');
const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server');
const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios');

// -- Config (edit config.jsonc to change defaults) ---------------------------

const CONFIG = loadConfig('perfRegression');

// -- CLI args ----------------------------------------------------------------

/**
 * Parse process.argv into an options object, applying config defaults.
 * Exits the process (code 1) on invalid input; exits (code 0) after --help.
 * @returns parsed benchmark options
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		runs: CONFIG.runsPerScenario ?? 5,
		verbose: false,
		ci: false,
		noCache: false,
		force: false,
		heapSnapshots: false,
		/** @type {string[]} */
		scenarios: [],
		/** @type {string | undefined} */
		build: undefined,
		/** @type {string | undefined} */
		baseline: undefined,
		/** @type {string | undefined} */
		baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
		saveBaseline: false,
		threshold: CONFIG.regressionThreshold ?? 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: CONFIG.metricThresholds ?? {},
		/** @type {string | undefined} */
		resume: undefined,
		productionBuild: false,
		/** @type {Record<string, any>} */
		settingsOverrides: {},
		/** @type {Record<string, any>} */
		testSettingsOverrides: {},
		/** @type {Record<string, any>} */
		baselineSettingsOverrides: {},
		cleanupDiagnostics: false,
	};
	for (let i = 0; i < args.length; i++) {
		switch (args[i]) {
			case '--runs': opts.runs = parseInt(args[++i], 10); break;
			case '--verbose': opts.verbose = true; break;
			case '--scenario': case '-s': opts.scenarios.push(args[++i]); break;
			case '--build': case '-b': opts.build = args[++i]; break;
			case '--baseline': opts.baseline = args[++i]; break;
			case '--baseline-build': opts.baselineBuild = args[++i]; break;
			case '--no-baseline': opts.baselineBuild = undefined; break;
			case '--save-baseline': opts.saveBaseline = true; break;
			case '--threshold': opts.threshold = parseFloat(args[++i]); break;
			case '--resume': opts.resume = args[++i]; break;
			case '--production-build': opts.productionBuild = true; break;
			case '--setting': case '--test-setting': case '--baseline-setting': {
				const kv = args[++i];
				const eq = kv === undefined ? -1 : kv.indexOf('=');
				if (eq === -1) { console.error(`${args[i - 1]} requires key=value, got: ${kv}`); process.exit(1); }
				const key = kv.slice(0, eq);
				const raw = kv.slice(eq + 1);
				// Parse booleans and numbers, keep rest as strings
				const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw;
				const flag = args[i - 1];
				if (flag === '--test-setting') { opts.testSettingsOverrides[key] = val; }
				else if (flag === '--baseline-setting') { opts.baselineSettingsOverrides[key] = val; }
				else { opts.settingsOverrides[key] = val; }
				break;
			}
			case '--no-cache': opts.noCache = true; break;
			case '--force': opts.force = true; break;
			case '--heap-snapshots': opts.heapSnapshots = true; break;
			case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break;
			case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break;
			case '--help': case '-h':
				console.log([
					'Chat performance benchmark',
					'',
					'Options:',
					'  --runs <n>                Number of runs per scenario (default: 5)',
					'  --scenario <id>           Scenario to run (repeatable; default: all)',
					'  --build <path|ver>        Path to VS Code build, or a version to download',
					'                            (e.g. "1.110.0", "insiders", commit hash, or local path)',
					'  --baseline <path>         Compare against a baseline JSON file',
					'  --baseline-build <v>      Version or path to benchmark as baseline',
					'                            (e.g. "1.115.0", "insiders", commit hash, or local path)',
					'  --no-baseline             Skip baseline comparison entirely',
					'  --save-baseline           Save results as the new baseline (requires --baseline <path>)',
					'  --resume <path>           Resume a previous run, adding more iterations to increase',
					'                            confidence. Merges new runs with existing rawRuns data',
					'  --threshold <frac>        Regression threshold fraction (default: 0.2 = 20%)',
					'  --production-build        Build a local bundled package (via gulp vscode) for',
					'                            apples-to-apples comparison against a release baseline',
					'  --setting <k=v>           Set a VS Code setting override for all builds (repeatable)',
					'  --test-setting <k=v>      Set a VS Code setting override for test build only',
					'  --baseline-setting <k=v>  Set a VS Code setting override for baseline build only',
					'                            e.g. --setting chat.experimental.incrementalRendering.enabled=true',
					'  --no-cache                Ignore cached baseline data, always run fresh',
					'  --force                   Skip build mode mismatch confirmation',
					'  --heap-snapshots          Take heap snapshots (slow; auto-enabled in --ci mode)',
					'  --ci                      CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)',
					'  --cleanup-diagnostics     Remove heap snapshots, CPU profiles, and traces after each run to save disk space',
					'  --verbose                 Print per-run details',
					'',
					'Scenarios: ' + getScenarioIds().join(', '),
				].join('\n'));
				process.exit(0);
		}
	}
	// Fail fast on numeric typos (e.g. `--runs five`) — a NaN run count or
	// threshold would otherwise silently corrupt the statistics downstream.
	if (!Number.isInteger(opts.runs) || opts.runs < 1) {
		console.error(`--runs must be a positive integer, got: ${opts.runs}`);
		process.exit(1);
	}
	if (Number.isNaN(opts.threshold) || opts.threshold < 0) {
		console.error(`--threshold must be a non-negative number, got: ${opts.threshold}`);
		process.exit(1);
	}
	// The help text documents this dependency; enforce it instead of
	// discovering the missing path after the benchmark has already run.
	if (opts.saveBaseline && !opts.baseline) {
		console.error('--save-baseline requires --baseline <path>');
		process.exit(1);
	}
	if (opts.scenarios.length === 0) {
		opts.scenarios = getScenarioIds();
	} else {
		const knownIds = new Set(getScenarioIds());
		const unknown = opts.scenarios.filter(s => !knownIds.has(s));
		if (unknown.length > 0) {
			console.error(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`);
			process.exit(1);
		}
	}
	return opts;
}

// -- Build mode detection ----------------------------------------------------

/**
 * Classify an electron path into a build mode.
 * @param {string} electronPath
 * @returns {'dev' | 'production' | 'release'}
 */
function detectBuildMode(electronPath) {
	if (electronPath.includes('.vscode-test')) {
		return 'release';
	}
	if (electronPath.includes('VSCode-')) {
		return 'production';
	}
	return 'dev';
}

/**
 * Return a human-readable label for a build mode.
 * @param {'dev' | 'production' | 'release'} mode
 * @returns {string}
 */
function buildModeLabel(mode) {
	switch (mode) {
		case 'dev': return 'development (unbundled)';
		case 'production': return 'production (bundled, local)';
		case 'release': return 'release (bundled, downloaded)';
	}
}

// -- Production build --------------------------------------------------------
/**
 * Build a local production (bundled) VS Code package using `gulp vscode`.
 * Returns the path to the Electron executable in the packaged output.
 *
 * The gulp task compiles TypeScript, bundles JS, and packages with Electron
 * into `../VSCode-<platform>-<arch>/`. This is the same process used for
 * release builds, minus minification and mangling.
 */
function buildProductionBuild() {
	const product = require(path.join(ROOT, 'product.json'));
	const { platform, arch } = process;
	const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`);

	console.log('[chat-simulation] Building local production package (gulp vscode)...');
	console.log('[chat-simulation] This may take a few minutes on the first run.');

	const { execSync } = require('child_process');
	try {
		execSync('npm run gulp -- vscode', {
			cwd: ROOT,
			stdio: 'inherit',
			timeout: 10 * 60 * 1000, // 10 minute timeout
		});
	} catch (buildError) {
		// The copilot shim step may fail locally when the copilot SDK is not
		// fully packaged (it is normally supplied via CI). As long as the
		// Electron executable was produced we can still benchmark.
		console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...');
	}

	// Per-platform location of the packaged Electron executable.
	/** @type {string} */
	let electronPath;
	switch (platform) {
		case 'darwin':
			electronPath = path.join(destDir, `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort);
			break;
		case 'linux':
			electronPath = path.join(destDir, product.applicationName);
			break;
		default:
			electronPath = path.join(destDir, `${product.nameShort}.exe`);
			break;
	}

	if (!fs.existsSync(electronPath)) {
		console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`);
		process.exit(1);
	}

	// Merge product.overrides.json into the packaged product.json.
	// The overrides file contains extensionsGallery and other config that
	// the OSS product.json lacks. In dev builds these are loaded at
	// runtime when VSCODE_DEV is set, but the production build doesn't
	// set that flag so we bake them in.
	const overridesPath = path.join(ROOT, 'product.overrides.json');
	if (fs.existsSync(overridesPath)) {
		const appDir = platform === 'darwin'
			? path.join(destDir, `${product.nameLong}.app`, 'Contents', 'Resources', 'app')
			: path.join(destDir, 'resources', 'app');
		const packagedProductPath = path.join(appDir, 'product.json');
		if (fs.existsSync(packagedProductPath)) {
			const packagedProduct = JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8'));
			const overrides = JSON.parse(fs.readFileSync(overridesPath, 'utf-8'));
			// Later keys win, so overrides take precedence over OSS defaults.
			const merged = { ...packagedProduct, ...overrides };
			fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t'));
			console.log('[chat-simulation] Merged product.overrides.json into packaged product.json');
		}
	}

	console.log(`[chat-simulation] Production build ready: ${electronPath}`);
	return electronPath;
}

/**
 * @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
 */

/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta.
 * @param {number | string} raw
 * @returns {MetricThreshold}
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		// Numeric config values are fractional thresholds.
		return { type: 'fraction', value: raw };
	}
	// Strip unit suffix (ms, MB, etc.) and parse the number
	const magnitude = parseFloat(raw);
	if (isNaN(magnitude)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value: magnitude };
}
and parse the number274const num = parseFloat(raw);275if (isNaN(num)) {276throw new Error(`Invalid metric threshold: ${raw}`);277}278return { type: 'absolute', value: num };279}280281/**282* Get the regression threshold for a specific metric.283* Uses per-metric override from config if available, otherwise the global threshold.284* @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts285* @param {string} metric286* @returns {MetricThreshold}287*/288function getMetricThreshold(opts, metric) {289const raw = opts.metricThresholds?.[metric];290if (raw !== undefined) {291return parseMetricThreshold(raw);292}293return { type: 'fraction', value: opts.threshold };294}295296/**297* Check whether a change exceeds the threshold.298* @param {MetricThreshold} threshold299* @param {number} change - fractional change (e.g. 0.5 = 50% increase)300* @param {number} absoluteDelta - absolute difference (cur.median - bas.median)301* @returns {boolean}302*/303function exceedsThreshold(threshold, change, absoluteDelta) {304if (threshold.type === 'absolute') {305return absoluteDelta > threshold.value;306}307return change > threshold.value;308}309310// -- Metrics -----------------------------------------------------------------311312/**313* @typedef {{314* timeToUIUpdated: number,315* timeToFirstToken: number,316* timeToComplete: number,317* timeToRenderComplete: number,318* instructionCollectionTime: number,319* agentInvokeTime: number,320* heapUsedBefore: number,321* heapUsedAfter: number,322* heapDelta: number,323* heapDeltaPostGC: number,324* majorGCs: number,325* minorGCs: number,326* gcDurationMs: number,327* layoutCount: number,328* layoutDurationMs: number,329* recalcStyleCount: number,330* forcedReflowCount: number,331* longTaskCount: number,332* longAnimationFrameCount: number,333* longAnimationFrameTotalMs: number,334* frameCount: number,335* compositeLayers: number,336* paintCount: number,337* hasInternalMarks: boolean,338* responseHasContent: 
boolean,339* internalFirstToken: number,340* profilePath: string,341* tracePath: string,342* snapshotPath: string,343* extHostHeapUsedBefore: number,344* extHostHeapUsedAfter: number,345* extHostHeapDelta: number,346* extHostHeapDeltaPostGC: number,347* extHostProfilePath: string,348* extHostSnapshotPath: string,349* }} RunMetrics350*/351352// -- Single run --------------------------------------------------------------353354/**355* @param {string} electronPath356* @param {string} scenario357* @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise<void>, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise<void> }} mockServer358* @param {boolean} verbose359* @param {string} runIndex360* @param {string} runDir - timestamped run directory for diagnostics361* @param {'baseline' | 'test'} role - whether this is a baseline or test run362* @param {Record<string, any>} [settingsOverrides] - custom VS Code settings363* @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options364* @returns {Promise<RunMetrics>}365*/366async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) {367const takeHeapSnapshots = runOpts?.heapSnapshots ?? 
false;368const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides);369const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-');370// Extract a clean build label from the path.371// Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev"372// Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0"373// Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production"374let buildLabel = 'dev';375if (!isDevBuild) {376const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/);377if (vscodeTestMatch) {378buildLabel = vscodeTestMatch[1];379} else if (electronPath.includes('VSCode-')) {380buildLabel = 'production';381} else {382buildLabel = path.basename(electronPath);383}384}385386// For dev builds from a different repo, derive the repo root from the387// electron path so that the build loads its own out/ source code.388const appRoot = isDevBuild ? 
(getRepoRoot(electronPath) || ROOT) : ROOT;389if (isDevBuild && appRoot !== ROOT) {390if (verbose) {391console.log(` [debug] Using appRoot from electron path: ${appRoot}`);392}393}394395// Create a per-run diagnostics directory: <runDir>/<role>-<build>/<scenario>-<i>/396const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, ''));397fs.mkdirSync(runDiagDir, { recursive: true });398399const tracePath = path.join(runDiagDir, 'trace.json');400const extHostInspectPort = getNextExtHostInspectPort();401const vscode = await launchVSCode(402electronPath,403buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }),404buildEnv(mockServer, { isDevBuild }),405{ verbose },406);407activeVSCode = vscode;408const window = vscode.page;409410// Declared outside try so the finally block can clean up411/** @type {{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */412let extHostInspector = null;413/** @type {{ usedSize: number, totalSize: number } | null} */414let extHostHeapBefore = null;415/** @type {Omit<RunMetrics, 'majorGCs' | 'minorGCs' | 'gcDurationMs' | 'longTaskCount' | 'longAnimationFrameCount' | 'longAnimationFrameTotalMs' | 'timeToUIUpdated' | 'timeToFirstToken' | 'timeToComplete' | 'timeToRenderComplete' | 'layoutDurationMs' | 'instructionCollectionTime' | 'agentInvokeTime' | 'hasInternalMarks' | 'internalFirstToken'> | null} */416let partialMetrics = null;417// Timing vars hoisted for access in post-close trace parsing418let submitTime = 0;419let firstResponseTime = 0;420let responseCompleteTime = 0;421let renderCompleteTime = 0;422423try {424await window.waitForSelector('.monaco-workbench', { timeout: 60_000 });425426const cdp = await window.context().newCDPSession(window);427await cdp.send('Performance.enable');428const heapBefore = /** @type {any} */ (await 
cdp.send('Runtime.getHeapUsage'));429430const metricsBefore = await cdp.send('Performance.getMetrics');431432// Open chat433const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI';434await window.keyboard.press(chatShortcut);435436const CHAT_VIEW = 'div[id="workbench.panel.chat"]';437const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`;438439await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 });440await window.waitForFunction(441(selector) => Array.from(document.querySelectorAll(selector)).some(el => {442const rect = el.getBoundingClientRect();443return rect.width > 0 && rect.height > 0;444}),445chatEditorSel, { timeout: 15_000 },446);447448// Dismiss dialogs449const dismissDialog = async () => {450for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) {451const el = await window.$(sel);452if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; }453}454};455await dismissDialog();456457// Wait for extension activation458const reqsBefore = mockServer.requestCount();459try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { }460if (verbose) {461console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`);462}463464// Connect to extension host inspector for profiling/heap data465try {466extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 });467await extHostInspector.send('HeapProfiler.enable');468await extHostInspector.send('Profiler.enable');469await extHostInspector.send('Profiler.start');470extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage');471if (verbose && extHostHeapBefore) {472console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`);473}474} catch (err) {475if (verbose) {476console.log(` [ext-host] Could not connect to inspector: 
${err}`);477}478}479480// Wait for model resolution481await new Promise(r => setTimeout(r, 3000));482await dismissDialog();483484// Focus input485await window.click(chatEditorSel);486const focusStart = Date.now();487while (Date.now() - focusStart < 5_000) {488const focused = await window.evaluate((sel) => {489const el = document.querySelector(sel);490return el && (el.classList.contains('focused') || el.contains(document.activeElement));491}, chatEditorSel).catch(() => false);492if (focused) { break; }493await new Promise(r => setTimeout(r, 50));494}495496// Type message — use the smoke-test driver's typeInEditor when available497// (dev builds), fall back to pressSequentially for stable/insiders builds.498const chatMessage = `[scenario:${scenario}] Explain how this code works`;499const actualInputSelector = await window.evaluate((editorSel) => {500const editor = document.querySelector(editorSel);501if (!editor) { throw new Error('Chat editor not found'); }502return editor.querySelector('.native-edit-context') ? 
editorSel + ' .native-edit-context' : editorSel + ' textarea';503}, chatEditorSel);504505const hasDriver = await window.evaluate(() =>506// @ts-ignore507!!globalThis.driver?.typeInEditor508).catch(() => false);509510if (hasDriver) {511await window.evaluate(({ selector, text }) => {512// @ts-ignore513return globalThis.driver.typeInEditor(selector, text);514}, { selector: actualInputSelector, text: chatMessage });515} else {516// Fallback: click the input element and use pressSequentially517await window.click(actualInputSelector);518await new Promise(r => setTimeout(r, 200));519await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 });520}521522// Start CPU profiler to capture call stacks during the interaction523await cdp.send('Profiler.enable');524await cdp.send('Profiler.start');525526// Submit527const completionsBefore = mockServer.completionCount();528submitTime = Date.now();529await window.keyboard.press('Enter');530531// Wait for mock server to serve the response532try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { }533firstResponseTime = Date.now();534535// Wait for DOM response to settle536await dismissDialog();537const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`;538await window.waitForFunction(539(sel) => {540const responses = document.querySelectorAll(sel);541if (responses.length === 0) { return false; }542return !responses[responses.length - 1].classList.contains('chat-response-loading');543},544responseSelector, { timeout: 30_000 },545);546responseCompleteTime = Date.now();547548// -- User turn injection loop -----------------------------------------549// For multi-turn scenarios with user follow-ups, type each follow-up550// message and wait for the model's response to settle.551const userTurns = getUserTurns(scenario);552for (let ut = 0; ut < userTurns.length; ut++) {553const userTurn = userTurns[ut];554if (verbose) {555console.log(` [debug] User 
follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`);556}557558// Brief pause to let the UI settle between turns559await new Promise(r => setTimeout(r, 500));560561// Focus the chat input562await window.click(chatEditorSel);563const utFocusStart = Date.now();564while (Date.now() - utFocusStart < 3_000) {565const focused = await window.evaluate((sel) => {566const el = document.querySelector(sel);567return el && (el.classList.contains('focused') || el.contains(document.activeElement));568}, chatEditorSel).catch(() => false);569if (focused) { break; }570await new Promise(r => setTimeout(r, 50));571}572573// Type the follow-up message574if (hasDriver) {575await window.evaluate(({ selector, text }) => {576// @ts-ignore577return globalThis.driver.typeInEditor(selector, text);578}, { selector: actualInputSelector, text: userTurn.message });579} else {580await window.click(actualInputSelector);581await new Promise(r => setTimeout(r, 200));582await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 });583}584585// Submit follow-up586const utCompBefore = mockServer.completionCount();587await window.keyboard.press('Enter');588589// Wait for mock server to serve the response for this turn590try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { }591592// Wait for the new response to finish rendering.593// The chat list is virtualized — old response elements are594// recycled out of the DOM as new ones appear, so we cannot595// rely on counting DOM elements. 
Instead, scroll to the596// bottom and wait for no response to be in loading state.597await dismissDialog();598await window.evaluate((chatViewSel) => {599const input = document.querySelector(chatViewSel + ' .interactive-input-part');600if (input) { input.scrollIntoView({ block: 'end' }); }601}, CHAT_VIEW);602await new Promise(r => setTimeout(r, 200));603604await window.waitForFunction(605(sel) => {606const responses = document.querySelectorAll(sel);607if (responses.length === 0) { return false; }608return !responses[responses.length - 1].classList.contains('chat-response-loading');609},610responseSelector,611{ timeout: 30_000 },612);613responseCompleteTime = Date.now();614615if (verbose) {616const utResponseInfo = await window.evaluate((sel) => {617const responses = document.querySelectorAll(sel);618const last = responses[responses.length - 1];619return last ? (last.textContent || '').substring(0, 150) : '(empty)';620}, responseSelector);621console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`);622}623}624625// Stop CPU profiler and save the profile626const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop'));627const profilePath = path.join(runDiagDir, 'profile.cpuprofile');628fs.writeFileSync(profilePath, JSON.stringify(profile));629if (verbose) {630console.log(` [debug] CPU profile saved to ${profilePath}`);631}632633const responseInfo = await window.evaluate((sel) => {634const responses = document.querySelectorAll(sel);635const last = responses[responses.length - 1];636if (!last) { return { hasContent: false, text: '' }; }637const text = last.textContent || '';638return { hasContent: text.trim().length > 0, text: text.substring(0, 200) };639}, responseSelector);640641if (verbose) {642console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`);643console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`);644}645646// 
Wait for the typewriter animation to finish rendering.647// The chat UI animates streamed content word-by-word after the648// response stream completes. We need to wait until all content649// is rendered before capturing layout/style metrics, otherwise650// we miss the rendering phase where batching optimizations matter.651await window.waitForFunction(652(sel) => {653const responses = document.querySelectorAll(sel);654const last = responses[responses.length - 1];655if (!last) { return true; }656// The typewriter animation is done when there are no657// elements with the 'typewriter' or 'animating' class,658// and no pending cursor animations.659const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor');660return !hasAnimating;661},662responseSelector,663{ timeout: 30_000 },664).catch(() => {665// Fallback: if the selector-based check doesn't work (e.g.666// the CSS classes differ across versions), wait for content667// to stabilize by polling textContent.668});669670// Additional stabilization: poll until textContent stops changing.671// This catches any remaining animation regardless of CSS class names.672{673let prev = '';674let stableCount = 0;675const stabilizeStart = Date.now();676while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) {677const current = await window.evaluate((sel) => {678const responses = document.querySelectorAll(sel);679const last = responses[responses.length - 1];680return last ? 
(last.textContent || '') : '';681}, responseSelector).catch(() => '');682if (current === prev) {683stableCount++;684} else {685stableCount = 0;686prev = current;687}688await new Promise(r => setTimeout(r, 100));689}690}691renderCompleteTime = Date.now();692if (verbose) {693console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`);694}695696const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));697const metricsAfter = await cdp.send('Performance.getMetrics');698699// -- Extension host metrics (non-snapshot) ---------------------------700let extHostHeapUsedBefore = -1;701let extHostHeapUsedAfter = -1;702let extHostHeapDelta = -1;703let extHostHeapDeltaPostGC = -1;704let extHostProfilePath = '';705let extHostSnapshotPath = '';706if (extHostInspector && extHostHeapBefore) {707try {708extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024);709710// Stop CPU profiler and save711const extProfile = await extHostInspector.send('Profiler.stop');712extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile');713fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile));714if (verbose) {715console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`);716}717718// Heap usage after interaction719const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage');720extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024);721extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore;722723// Force GC and measure retained heap724try {725await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });726await new Promise(r => setTimeout(r, 200));727const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage');728extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore;729} catch 
{730extHostHeapDeltaPostGC = -1;731}732733if (verbose) {734console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`);735}736} catch (err) {737if (verbose) {738console.log(` [ext-host] Error collecting metrics: ${err}`);739}740}741}742743// -- Heap snapshots (opt-in, parallelized) ---------------------------744let snapshotPath = '';745if (takeHeapSnapshots) {746const snapshotPromises = [];747748// Renderer snapshot749snapshotPromises.push((async () => {750const p = path.join(runDiagDir, 'heap.heapsnapshot');751await cdp.send('HeapProfiler.enable');752const chunks = /** @type {string[]} */ ([]);753cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {754chunks.push(params.chunk);755});756await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });757fs.writeFileSync(p, chunks.join(''));758return p;759})());760761// Extension host snapshot (parallel with renderer)762if (extHostInspector && extHostHeapBefore) {763snapshotPromises.push((async () => {764const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot');765const chunks = /** @type {string[]} */ ([]);766extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {767chunks.push(params.chunk);768});769await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });770fs.writeFileSync(p, chunks.join(''));771return p;772})());773}774775const snapshotResults = await Promise.all(snapshotPromises);776snapshotPath = snapshotResults[0];777if (snapshotResults.length > 1) {778extHostSnapshotPath = snapshotResults[1];779}780781if (verbose) {782console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`);783if (extHostSnapshotPath) {784console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`);785}786}787}788789// Close ext host inspector now that snapshots (if any) are done790if (extHostInspector) 
{791extHostInspector.close();792}793794// Store partial metrics here so we can combine with trace data after close.795796/** @param {any} r @param {string} name */797function getMetric(r, name) {798const e = r.metrics?.find((/** @type {any} */ m) => m.name === name);799return e ? e.value : 0;800}801802partialMetrics = {803heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024),804heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024),805heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024),806heapDeltaPostGC: await (async () => {807// Force a full GC then measure heap to get deterministic retained-memory delta.808// --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc()809// when includeCommandLineAPI is true.810try {811await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });812await new Promise(r => setTimeout(r, 200));813const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));814return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024);815} catch {816return -1; // gc() not available in this build817}818})(),819layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'),820recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'),821forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'),822frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'),823compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'),824paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'),825responseHasContent: 
responseInfo.hasContent,826profilePath,827tracePath,828snapshotPath,829extHostHeapUsedBefore,830extHostHeapUsedAfter,831extHostHeapDelta,832extHostHeapDeltaPostGC,833extHostProfilePath,834extHostSnapshotPath,835};836} finally {837if (extHostInspector) {838try { extHostInspector.close(); } catch { }839}840activeVSCode = null;841await vscode.close();842}843844// Read the trace file written by VS Code on exit via --trace-startup-file845/** @type {Array<any>} */846let traceEvents = [];847try {848const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8'));849traceEvents = traceData.traceEvents || [];850} catch {851// Trace file may not exist if VS Code crashed before shutdown852}853854// Extract code/chat/* perf marks from blink.user_timing trace events.855// These appear as instant ('R' or 'I') events with timestamps in microseconds.856const chatMarks = traceEvents857.filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/'))858.map(e => ({ name: e.name, startTime: e.ts / 1000 }));859860if (verbose && chatMarks.length > 0) {861console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`);862}863864// Parse timing — prefer internal code/chat/* marks (precise, in-process)865// with client-side Date.now() as fallback for older builds without marks.866const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated');867const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken');868const timeToFirstToken = internalFirstToken >= 0 ? 
internalFirstToken : (firstResponseTime - submitTime);869const timeToComplete = responseCompleteTime - submitTime;870const timeToRenderComplete = renderCompleteTime - submitTime;871const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions');872const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke');873874// Parse GC events from trace.875// Use the trace-event category and phase fields which are stable876// across V8 versions, rather than matching event name substrings.877let majorGCs = 0, minorGCs = 0, gcDurationMs = 0;878for (const event of traceEvents) {879const isGC = event.cat === 'v8.gc'880|| event.cat === 'devtools.timeline,v8'881|| (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => {882const t = c.trim();883return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats';884}));885if (!isGC) { continue; }886// Only count complete ('X') or duration-begin ('B') events to887// avoid double-counting begin/end pairs.888if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; }889const name = event.name || '';890if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; }891else if (/Minor|Scaveng/i.test(name)) { minorGCs++; }892else { minorGCs++; } // default unknown GC events to minor893if (event.dur) { gcDurationMs += event.dur / 1000; }894}895// Parse Layout duration from devtools.timeline trace events.896let layoutDurationMs = 0;897for (const event of traceEvents) {898if (event.name === 'Layout' && event.ph === 'X' && event.dur) {899layoutDurationMs += event.dur / 1000;900}901}902903let longTaskCount = 0;904for (const event of traceEvents) {905if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; }906}907908// Parse Long Animation Frame (LoAF) events from devtools.timeline trace.909// AnimationFrame events use async flow 
pairs (ph:'s' start, ph:'f' finish)910// with matching ids. Compute duration from each s→f pair.911let longAnimationFrameCount = 0;912let longAnimationFrameTotalMs = 0;913{914/** @type {Map<number, number>} */915const frameStarts = new Map();916for (const event of traceEvents) {917if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') {918if (event.ph === 's') {919frameStarts.set(event.id, event.ts);920} else if (event.ph === 'f' && frameStarts.has(event.id)) {921const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000;922frameStarts.delete(event.id);923if (durationMs > 50) {924longAnimationFrameCount++;925longAnimationFrameTotalMs += durationMs;926}927}928}929}930}931932return {933...partialMetrics,934timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime,935hasInternalMarks: chatMarks.length > 0,936internalFirstToken,937majorGCs, minorGCs,938gcDurationMs: Math.round(gcDurationMs * 100) / 100,939layoutDurationMs: Math.round(layoutDurationMs * 100) / 100,940longTaskCount,941longAnimationFrameCount,942longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100,943};944}945946// -- CI summary generation ---------------------------------------------------947948const GITHUB_REPO = 'https://github.com/microsoft/vscode';949950/**951* Format a build identifier as a Markdown link when possible.952* - Commit SHAs link to the commit page.953* - Semver versions link to the release tag page.954* - Everything else (e.g. 
"baseline", "dev (local)") is returned as inline code.
 * @param {string} label
 * @returns {string}
 */
function formatBuildLink(label) {
	// Looks like a full or abbreviated git commit SHA (7-40 lowercase hex chars).
	if (/^[0-9a-f]{7,40}$/.test(label)) {
		const short = label.substring(0, 7);
		return `[\`${short}\`](${GITHUB_REPO}/commit/${label})`;
	}
	// Looks like a semver release version (e.g. "1.115.0") — link to its release tag.
	if (/^\d+\.\d+\.\d+/.test(label)) {
		return `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`;
	}
	// Anything else (free-form labels): render as inline code with no link.
	return `\`${label}\``;
}

/**
 * Build a GitHub compare link between two build identifiers, if both are
 * commit-like or version-like references. Returns empty string otherwise.
 * @param {string} base
 * @param {string} test
 * @returns {string}
 */
function formatCompareLink(base, test) {
	// A linkable ref is either a commit SHA or a semver version string.
	const isRef = (/** @type {string} */ v) => /^[0-9a-f]{7,40}$/.test(v) || /^\d+\.\d+\.\d+/.test(v);
	if (!isRef(base) || !isRef(test)) {
		return '';
	}
	// GitHub's two-dot compare view: <repo>/compare/<base>...<test>
	return `[compare](${GITHUB_REPO}/compare/${base}...${test})`;
}

/**
 * Generate a detailed Markdown summary table for CI.
 * Printed to stdout and written to ci-summary.md.
 *
 * @param {Record<string, any>} jsonReport
 * @param {Record<string, any> | null} baseline
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
 */
function generateCISummary(jsonReport, baseline, opts) {
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);
	// [metricName, statsGroup, displayUnit] — group matches the per-scenario
	// stats buckets (timing/rendering/memory/extHost) built from METRIC_DEFS.
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can produce a REGRESSION/improved verdict; everything
	// else is reported as informational ('info') in the tables below.
	const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']);

	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);

	// -- Collect verdicts per scenario/metric --------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;

	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];

		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }

				// Relative change vs baseline median; guard against divide-by-zero.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);

				// Negative values are "unavailable" sentinels (e.g. -1 when a probe
				// could not run), so drop them from the significance-test samples.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				// NOTE(review): welchTTest appears to return a falsy value when it
				// cannot compute (presumably too few samples) — verify in common/utils.
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				// Verdict taxonomy:
				//  - REGRESSION: past threshold AND statistically significant
				//    (or no t-test available — treated as regression to fail safe)
				//  - noise:      past threshold but t-test says not significant
				//  - improved:   significant move past threshold in the good direction
				//  - ok:         within threshold
				//  - info:       metric is informational only (not in regression set)
				let verdict = '';
				if (isRegressionMetric) {
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}

				// \xb1 is the ± sign: "median ± stddev" display strings.
				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}

	// -- Header with verdict up front ----------------------------------------
	const hasRegressions = totalRegressions > 0;
	const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
	const verdictText = hasRegressions
		? `${totalRegressions} regression(s) detected`
		: totalImprovements > 0
			? `No regressions \u2014 ${totalImprovements} improvement(s)`
			: 'No significant changes';

	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// Per-metric threshold overrides that differ from the global fraction are
	// called out explicitly next to the global threshold.
	const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
		const parsed = parseMetricThreshold(v);
		return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
	});
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			const parsed = parseMetricThreshold(v);
			return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
	if (jsonReport.buildMode) {
		lines.push(`| **Build mode** | ${jsonReport.buildMode} |`);
	}
	lines.push('');
	if (jsonReport.mismatchedBuildMode) {
		lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.');
		lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,');
		lines.push('> use the same build type for both (e.g. `--production-build` with a local');
		lines.push('> baseline path, or two version strings).');
		lines.push('');
	}

	// -- At-a-glance overview table: one row per scenario --------------------
	lines.push(`## Overview`);
	lines.push('');
	lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);

		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');

		// Render a percent-change cell; em-dash when the metric is missing.
		const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => {
			if (!v) { return '\u2014'; }
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
			return pct;
		};

		// Collapse the key metrics into a single row verdict; any regression wins.
		const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => {
			const hasRegression = vs.some(v => v.verdict === 'REGRESSION');
			const hasImproved = vs.some(v => v.verdict === 'improved');
			if (hasRegression) { return '\u274C Regressed'; }
			if (hasImproved) { return '\u2B06\uFE0F Improved'; }
			return '\u2705 OK';
		};

		const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
		const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts));

		lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');

	// -- Regressions & improvements detail section ---------------------------
	const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
	if (hasNotable) {
		lines.push('## Regressions & Improvements');
		lines.push('');
		lines.push('Only metrics that regressed or improved significantly are shown below.');
		lines.push('');

		for (const scenario of scenarios) {
			const verdicts = scenarioVerdicts.get(scenario) || [];
			const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
			if (notable.length === 0) { continue; }

			const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`### ${icon} ${scenario}`);
			lines.push('');
			lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
			lines.push('|--------|----------|------|--------|---------|---------|');
			for (const v of notable) {
				const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
				const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
				lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`);
			}
			lines.push('');
		}
	}

	// -- Full metric tables in collapsible section ---------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];

		lines.push(`### ${scenario}`);
		lines.push('');

		// Without a baseline there is nothing to compare against — emit a
		// stats-only table (median/stddev/coefficient-of-variation/sample size).
		if (!base) {
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}

		lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
		lines.push(`|--------|----------|------|--------|---------|---------|`);

		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');

	// -- Raw run data in collapsible section ---------------------------------
	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		lines.push(`### ${scenario}`);
		lines.push('');
		lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
		lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
		const runs = current.rawRuns || [];
		for (let i = 0; i < runs.length; i++) {
			const r = runs[i];
			const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
			lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
		}
		lines.push('');
	}
	// Mirror the same raw tables for the baseline runs, when available.
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			lines.push(`### ${scenario} (baseline)`);
			lines.push('');
			lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
			lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
			const runs = base.rawRuns || [];
			for (let i = 0; i < runs.length; i++) {
				const r = runs[i];
				const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
				lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
			}
			lines.push('');
		}
	}
	lines.push('</details>');
	lines.push('');

	return lines.join('\n');
}

// -- Cleanup on SIGINT/SIGTERM -----------------------------------------------

// Currently-running resources, registered by main()/runOnce() so the signal
// handlers below can tear them down on interrupt.
/** @type {{ close: () => Promise<void> } | null} */
let activeVSCode = null;
/** @type {{ close: () => Promise<void> } | null} */
let activeMockServer = null;

/**
 * Install SIGINT/SIGTERM handlers that close the active VS Code instance and
 * mock LLM server before exiting. Exit code 130 follows the shell convention
 * of 128 + signal number for SIGINT.
 */
function installSignalHandlers() {
	const cleanup = async () => {
		console.log('\n[chat-simulation] Caught interrupt, cleaning up...');
		// Best-effort teardown — never let a close() failure block exit.
		try { await activeVSCode?.close(); } catch { }
		try { await activeMockServer?.close(); } catch { }
		process.exit(130);
	};
	process.on('SIGINT', cleanup);
	process.on('SIGTERM', cleanup);
}

// -- Diagnostic cleanup ------------------------------------------------------

/**
 * Remove large diagnostic files (heap snapshots, CPU profiles, traces) from
 * a run's metrics to free disk space. Keeps the JSON results data intact.
 * @param {RunMetrics} metrics
 */
function cleanupRunDiagnostics(metrics) {
	const filesToDelete = [
		metrics.profilePath,
		metrics.tracePath,
		metrics.snapshotPath,
		metrics.extHostProfilePath,
		metrics.extHostSnapshotPath,
	];
	for (const filePath of filesToDelete) {
		// Paths may be empty strings when a diagnostic was not collected.
		if (filePath && fs.existsSync(filePath)) {
			try {
				fs.rmSync(filePath, { force: true });
			} catch {
				// Ignore cleanup errors
			}
		}
	}
}

/**
 * Clean up diagnostics for all scenarios that did NOT regress.
 * Keeps diagnostics for regressed scenarios so they can be investigated.
 * @param {Record<string, RunMetrics[]>} allResults - test results by scenario
 * @param {Set<string>} regressedScenarios - scenarios that regressed
 */
function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) {
	for (const [scenario, runs] of Object.entries(allResults)) {
		if (regressedScenarios.has(scenario)) {
			continue;
		}
		for (const metrics of 
runs) {1329cleanupRunDiagnostics(metrics);1330}1331}1332}13331334// -- Main --------------------------------------------------------------------13351336async function main() {1337registerPerfScenarios();1338const opts = parseArgs();13391340installSignalHandlers();13411342const { startServer } = require('./common/mock-llm-server');1343const mockServer = await startServer(0);1344activeMockServer = mockServer;1345console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`);13461347// -- Resume mode --------------------------------------------------------1348if (opts.resume) {1349if (!fs.existsSync(opts.resume)) {1350console.error(`[chat-simulation] Resume file not found: ${opts.resume}`);1351process.exit(1);1352}1353const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8'));1354const prevDir = path.dirname(opts.resume);13551356// Find the associated baseline JSON in the same directory1357const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json'));1358const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null;1359const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null;13601361// Determine which scenarios to resume (default: all from previous run)1362const resumeScenarios = opts.scenarios.length > 01363? opts.scenarios.filter(s => prevResults.scenarios?.[s])1364: Object.keys(prevResults.scenarios || {});13651366if (resumeScenarios.length === 0) {1367console.error('[chat-simulation] No matching scenarios found in previous results');1368process.exit(1);1369}13701371const testElectron = await resolveBuild(opts.build);1372const baselineVersion = prevBaseline?.baselineBuildVersion;1373const baselineElectron = baselineVersion ? 
await resolveBuild(baselineVersion) : null;13741375const runsToAdd = opts.runs;1376console.log(`[chat-simulation] Resuming from: ${opts.resume}`);1377console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`);1378console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`);1379if (prevBaseline) {1380console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`);1381}1382console.log('');13831384for (const scenario of resumeScenarios) {1385console.log(`[chat-simulation] === Resuming: ${scenario} ===`);1386const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || [];1387const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || [];13881389// Run additional test iterations1390console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`);1391for (let i = 0; i < runsToAdd; i++) {1392const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`;1393console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);1394try {1395const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });1396// Clean up previous run's diagnostics to bound disk usage; keep the latest1397if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); }1398prevTestRuns.push(m);1399if (opts.verbose) {1400const src = m.hasInternalMarks ? 
'internal' : 'client-side';1401console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`);1402}1403} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }1404}14051406// Run additional baseline iterations1407if (baselineElectron && prevBaseline?.scenarios?.[scenario]) {1408console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`);1409for (let i = 0; i < runsToAdd; i++) {1410const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`;1411console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);1412try {1413const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });1414// Clean up previous run's diagnostics to bound disk usage; keep the latest1415if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); }1416prevBaseRuns.push(m);1417} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }1418}1419}14201421// Recompute stats with merged data1422const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns });1423for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); }1424prevResults.scenarios[scenario] = sd;14251426if (prevBaseline?.scenarios?.[scenario]) {1427const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns });1428for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); }1429prevBaseline.scenarios[scenario] = bsd;1430}1431console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? 
`, baseline n=${prevBaseRuns.length}` : ''}`);1432console.log('');1433}14341435// Write updated files back1436prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs));1437prevResults.lastResumed = new Date().toISOString();1438fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2));1439console.log(`[chat-simulation] Updated results: ${opts.resume}`);14401441if (prevBaseline && baselineFile) {1442prevBaseline.lastResumed = new Date().toISOString();1443fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2));1444// Also update cached baseline1445const cachedPath = path.join(DATA_DIR, path.basename(baselineFile));1446fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2));1447console.log(`[chat-simulation] Updated baseline: ${baselineFile}`);1448}14491450// -- Re-run comparison with merged data --------------------------------1451opts.baseline = baselineFile || undefined;1452const jsonReport = prevResults;1453jsonReport._resultsPath = opts.resume;14541455// Fall through to comparison logic below1456await printComparison(jsonReport, opts);1457await mockServer.close();1458return;1459}14601461// -- Normal (non-resume) flow -------------------------------------------1462// --production-build: build a local bundled (non-dev) package from the1463// current source tree using `gulp vscode`. 
This produces the same1464// packaging as a release build (bundled JS, no VSCODE_DEV) while still1465// testing your local changes.1466if (opts.productionBuild && !opts.build) {1467const prodBuildPath = buildProductionBuild();1468opts.build = prodBuildPath;1469console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`);1470}14711472const electronPath = await resolveBuild(opts.build);14731474if (!fs.existsSync(electronPath)) {1475console.error(`Electron not found at: ${electronPath}`);1476console.error('Run "node build/lib/preLaunch.ts" first, or pass --build <path>');1477process.exit(1);1478}14791480// Detect build modes for both test and baseline builds1481const testBuildMode = detectBuildMode(electronPath);14821483// Resolve the baseline build path early so we can detect its mode.1484// For version strings this downloads; for local paths it resolves directly.1485const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild);1486const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString;1487/** @type {string | undefined} */1488let baselineElectronPath;1489if (isBaselineLocalPath) {1490baselineElectronPath = await resolveBuild(opts.baselineBuild);1491if (!fs.existsSync(baselineElectronPath)) {1492console.error(`Baseline build not found at: ${baselineElectronPath}`);1493process.exit(1);1494}1495}1496const baselineBuildMode = opts.baselineBuild1497? (isBaselineVersionString ? 
'release' : detectBuildMode(baselineElectronPath || ''))1498: undefined;14991500const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode;15011502// Create a timestamped run directory for all output1503const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);1504const runDir = path.join(DATA_DIR, runTimestamp);1505fs.mkdirSync(runDir, { recursive: true });1506console.log(`[chat-simulation] Output: ${runDir}`);15071508// Compute effective settings per role1509const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides };1510const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides };15111512// -- Baseline build --------------------------------------------------1513if (opts.baselineBuild) {1514// Use a sanitized label for file names — replace path separators for local paths1515const baselineLabel = isBaselineLocalPath1516? path.basename(path.resolve(opts.baselineBuild))1517: opts.baselineBuild;1518const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`);15191520// Local paths: always run fresh (no caching — the build may have changed)1521// Version strings: use caching as before1522const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`);1523const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath)1524? 
JSON.parse(fs.readFileSync(cachedPath, 'utf-8'))1525: null;15261527if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) {1528// Check if the cache covers all requested scenarios1529const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {}));1530const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s));15311532// Also check if cached scenarios have fewer runs than requested1533const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => {1534const cached = cachedBaseline.scenarios?.[s];1535return cached && (cached.rawRuns?.length || 0) < opts.runs;1536});15371538if (missingScenarios.length === 0 && shortScenarios.length === 0) {1539console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`);1540fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));1541opts.baseline = baselineJsonPath;1542} else {1543const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])];1544if (missingScenarios.length > 0) {1545console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`);1546}1547if (shortScenarios.length > 0) {1548console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`);1549}1550console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`);1551const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);1552for (const scenario of scenariosToRun) {1553const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || [];1554const runsNeeded = opts.runs - existingRuns.length;1555/** @type {RunMetrics[]} */1556const newResults = [];1557for (let i = 0; i < runsNeeded; i++) {1558try {1559const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, 
`baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });1560// Clean up previous run's diagnostics to bound disk usage; keep the latest1561if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); }1562newResults.push(m);1563}1564catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }1565}1566const allRuns = [...existingRuns, ...newResults];1567if (allRuns.length > 0) {1568const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns });1569for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); }1570cachedBaseline.scenarios[scenario] = sd;1571}1572}1573cachedBaseline.runsPerScenario = opts.runs;1574fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));1575if (cachedPath) {1576fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2));1577}1578opts.baseline = baselineJsonPath;1579}1580} else {1581const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);1582console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`);1583/** @type {Record<string, RunMetrics[]>} */1584const baselineResults = {};1585for (const scenario of opts.scenarios) {1586/** @type {RunMetrics[]} */1587const results = [];1588for (let i = 0; i < opts.runs; i++) {1589try {1590const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });1591// Clean up previous run's diagnostics to bound disk usage; keep the latest1592if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }1593results.push(m);1594}1595catch (err) { console.error(`[chat-simulation] Baseline 
run ${i + 1} failed: ${err}`); }1596}1597if (results.length > 0) { baselineResults[scenario] = results; }1598}1599const baselineReport = {1600timestamp: new Date().toISOString(),1601baselineBuildVersion: opts.baselineBuild,1602platform: process.platform,1603runsPerScenario: opts.runs,1604scenarios: /** @type {Record<string, any>} */ ({}),1605};1606for (const [scenario, results] of Object.entries(baselineResults)) {1607const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });1608for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }1609baselineReport.scenarios[scenario] = sd;1610}1611fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2));1612// Cache at the top level for reuse across runs (version strings only)1613if (cachedPath) {1614fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2));1615}1616opts.baseline = baselineJsonPath;1617}1618console.log('');1619}16201621// -- Run benchmarks --------------------------------------------------1622console.log(`[chat-simulation] Electron: ${electronPath}`);1623console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`);1624if (baselineBuildMode) {1625console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`);1626}1627console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`);1628console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`);1629if (Object.keys(opts.settingsOverrides).length > 0) {1630console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`);1631}1632if (Object.keys(opts.testSettingsOverrides).length > 0) {1633console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`);1634}1635if (Object.keys(opts.baselineSettingsOverrides).length > 0) {1636console.log(`[chat-simulation] Settings 
overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`);1637}16381639if (isMismatchedBuildMode) {1640console.log('');1641console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`);1642console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples');1643console.log('[chat-simulation] comparisons, use the same build type for both.');1644if (testBuildMode === 'dev') {1645console.log('[chat-simulation] To use a local production build instead:');1646console.log('[chat-simulation] npm run perf:chat -- --production-build');1647}1648if (!opts.ci && !opts.force) {1649const readline = require('readline');1650const rl = readline.createInterface({ input: process.stdin, output: process.stdout });1651const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve));1652rl.close();1653if (String(answer).toLowerCase() !== 'y') {1654console.log('[chat-simulation] Aborted.');1655await mockServer.close();1656process.exit(0);1657}1658}1659}1660console.log('');16611662/** @type {Record<string, RunMetrics[]>} */1663const allResults = {};1664let anyFailed = false;16651666for (const scenario of opts.scenarios) {1667console.log(`[chat-simulation] === Scenario: ${scenario} ===`);1668/** @type {RunMetrics[]} */1669const results = [];1670for (let i = 0; i < opts.runs; i++) {1671console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`);1672try {1673const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots });1674// Clean up previous run's diagnostics to bound disk usage; keep the latest1675if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }1676results.push(metrics);1677if (opts.verbose) {1678const src = metrics.hasInternalMarks ? 
'internal' : 'client-side';1679console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`);1680}1681} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }1682}1683if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; }1684else { allResults[scenario] = results; }1685console.log('');1686}16871688// -- Summary ---------------------------------------------------------1689console.log('[chat-simulation] ======================= Summary =======================');1690for (const [scenario, results] of Object.entries(allResults)) {1691console.log('');1692console.log(` -- ${scenario} (${results.length} runs) --`);1693console.log('');1694console.log(' Timing:');1695console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms'));1696console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms'));1697console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms'));1698console.log('');1699console.log(' Rendering:');1700console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', ''));1701console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms'));1702console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', ''));1703console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', ''));1704console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', ''));1705console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. 
frames ', ''));1706console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms'));1707console.log(summarize(results.map(r => r.frameCount), ' Frames ', ''));1708console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', ''));1709console.log(summarize(results.map(r => r.paintCount), ' Paints ', ''));1710console.log('');1711console.log(' Memory:');1712console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB'));1713console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));1714console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms'));1715if (results.some(r => r.extHostHeapDelta >= 0)) {1716console.log('');1717console.log(' Extension Host:');1718console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB'));1719console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB'));1720console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB'));1721console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));1722}1723}17241725// -- JSON output -----------------------------------------------------1726const jsonPath = path.join(runDir, 'results.json');1727const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record<string, any>, _resultsPath?: string }} */ ({1728timestamp: new Date().toISOString(),1729platform: process.platform,1730runsPerScenario: opts.runs,1731buildMode: testBuildMode,1732mismatchedBuildMode: !!isMismatchedBuildMode,1733scenarios: /** @type {Record<string, any>} */ ({}),1734});1735for (const [scenario, results] of Object.entries(allResults)) {1736const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });1737for (const [metric, group] of METRIC_DEFS) 
{ sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
		jsonReport.scenarios[scenario] = sd;
	}
	fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2));
	// Stash the output path on the in-memory report only — assigned AFTER the
	// file write so results.json itself does not contain it. Used below by
	// printComparison for the --resume hint.
	jsonReport._resultsPath = jsonPath;
	console.log('');
	console.log(`[chat-simulation] Results written to ${jsonPath}`);

	// -- Save baseline ---------------------------------------------------
	if (opts.saveBaseline) {
		if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline <path>'); process.exit(1); }
		fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2));
		console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`);
	}

	// -- Baseline comparison ---------------------------------------------
	// NOTE: printComparison calls process.exit(1) when a statistically
	// significant regression is found, so the cleanup below only runs when
	// there was no regression.
	const regressedScenarios = await printComparison(jsonReport, opts);

	// Clean up diagnostics for scenarios that did not regress
	if (opts.cleanupDiagnostics) {
		cleanupNonRegressedDiagnostics(allResults, regressedScenarios);
	}

	if (anyFailed) { process.exit(1); }
	await mockServer.close();
}

/**
 * Print baseline comparison and exit with code 1 if regressions found.
 * Returns the set of scenario IDs that regressed.
 *
 * A metric only counts as a regression when it BOTH exceeds its threshold
 * (resolved via getMetricThreshold / exceedsThreshold) AND the difference is
 * statistically significant per Welch's t-test on the raw per-run values.
 * Threshold-exceeding but non-significant metrics are reported as
 * inconclusive, with a --resume hint whose run count comes from a power
 * analysis of the observed effect size.
 *
 * Side effects: reads the baseline JSON from disk, writes the CI summary
 * markdown when opts.ci is set, and terminates the process on regression.
 *
 * @param {Record<string, any>} jsonReport
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts
 * @returns {Promise<Set<string>>}
 */
async function printComparison(jsonReport, opts) {
	let regressionFound = false;
	let inconclusiveFound = false;
	/** @type {Set<string>} */
	const regressedScenarios = new Set();
	if (opts.baseline && fs.existsSync(opts.baseline)) {
		const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'));
		console.log('');
		console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`);
		console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`);
		if (jsonReport.mismatchedBuildMode) {
			console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`);
			console.log('[chat-simulation] Results may not be directly comparable.');
		}
		console.log('');

		// Metrics that trigger regression failure when they exceed the threshold
		const regressionMetrics = [
			// [metric, group, unit]
			['timeToFirstToken', 'timing', 'ms'],
			['timeToComplete', 'timing', 'ms'],
			['layoutCount', 'rendering', ''],
			['recalcStyleCount', 'rendering', ''],
			['forcedReflowCount', 'rendering', ''],
			['longTaskCount', 'rendering', ''],
		];
		// Informational metrics — shown in comparison but don't trigger failure
		const infoMetrics = [
			['heapDelta', 'memory', 'MB'],
			['gcDurationMs', 'memory', 'ms'],
			['extHostHeapDelta', 'extHost', 'MB'],
			['extHostHeapDeltaPostGC', 'extHost', 'MB'],
		];

		for (const scenario of Object.keys(jsonReport.scenarios)) {
			const current = jsonReport.scenarios[scenario];
			const base = baseline.scenarios?.[scenario];
			if (!base) { console.log(` ${scenario}: (no baseline)`); continue; }

			/** @type {string[]} */
			const diffs = [];
			let scenarioRegression = false;

			for (const [metric, group, unit] of regressionMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// Skip metrics missing on either side. A zero/absent baseline median
				// would make the relative change undefined, so it is skipped too.
				if (!cur || !bas || !bas.median) { continue; }
				const change = (cur.median - bas.median) / bas.median;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;

				// Statistical significance via Welch's t-test on raw run values.
				// Negative values are filtered out — presumably "not measured"
				// sentinels (cf. the extHostHeapDelta >= 0 check in the summary);
				// TODO confirm against the RunMetrics producer.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let flag = '';
				if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
					if (!ttest) {
						// Threshold exceeded but no t-test result (flagged as too few
						// runs): inconclusive rather than a hard failure.
						flag = ' ← possible regression (n too small for significance test)';
						inconclusiveFound = true;
					} else if (ttest.significant) {
						// Threshold exceeded AND statistically significant → regression.
						flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`;
						scenarioRegression = true;
						regressionFound = true;
					} else {
						flag = ` (likely noise — p=${ttest.pValue}, not significant)`;
						inconclusiveFound = true;
					}
				} else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') {
					// Below threshold but a high-confidence increase — surface it
					// informationally without failing.
					flag = ` (significant increase, p=${ttest.pValue})`;
				}
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`);
			}
			for (const [metric, group, unit] of infoMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				// Guard against division by zero for info metrics (they may
				// legitimately have a 0 baseline median).
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`);
			}
			console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`);
			if (scenarioRegression) { regressedScenarios.add(scenario); }
			diffs.forEach(d => console.log(d));
		}

		console.log('');
		console.log(regressionFound
			? `[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance`
			: `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`);

		if (inconclusiveFound && !regressionFound) {
			// Find the results.json path to suggest in the hint
			const resultsPath = Object.keys(jsonReport.scenarios).length > 0
				? (jsonReport._resultsPath || opts.resume || 'path/to/results.json')
				: 'path/to/results.json';
			// Estimate required runs from the observed effect size and variance
			// using power analysis for Welch's t-test (alpha=0.05, 80% power).
			// n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d
			let maxNeeded = 0;
			for (const scenario of Object.keys(jsonReport.scenarios)) {
				const current = jsonReport.scenarios[scenario];
				const base = baseline.scenarios?.[scenario];
				if (!base) { continue; }
				// NOTE(review): `group` is destructured but unused in this loop.
				for (const [metric, group] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) {
					const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
					const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
					// Need at least 2 samples per side for a sample variance.
					if (curRaw.length < 2 || basRaw.length < 2) { continue; }
					const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length;
					const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length;
					// Sample variances (Bessel-corrected, n-1 denominator).
					const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1);
					const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1);
					const pooledSD = Math.sqrt((varA + varB) / 2);
					if (pooledSD === 0) { continue; }
					// Cohen's d: standardized effect size.
					const d = Math.abs(meanB - meanA) / pooledSD;
					if (d === 0) { continue; }
					// z_0.025 = 1.96, z_0.2 = 0.842
					const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2);
					const currentN = Math.min(curRaw.length, basRaw.length);
					maxNeeded = Math.max(maxNeeded, nPerGroup - currentN);
				}
			}
			// Clamp the suggestion to a practical 1..20 additional runs.
			const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20));
			console.log('');
			console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.');
			console.log('[chat-simulation] To increase confidence, add more runs with --resume:');
			console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`);
		}
	}

	// -- CI summary ------------------------------------------------------
	if (opts.ci) {
		// Re-read the baseline from disk (it may not have been loaded above if
		// the comparison block was skipped).
		const ciBaseline = opts.baseline && fs.existsSync(opts.baseline)
			? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'))
			: null;
		const summary = generateCISummary(jsonReport, ciBaseline, {
			threshold: opts.threshold,
			metricThresholds: opts.metricThresholds,
			runs: jsonReport.runsPerScenario || opts.runs,
			baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild,
			build: opts.build,
		});

		// Write to file for GitHub Actions $GITHUB_STEP_SUMMARY
		const summaryPath = path.join(DATA_DIR, 'ci-summary.md');
		fs.writeFileSync(summaryPath, summary);
		console.log(`[chat-simulation] CI summary written to ${summaryPath}`);

		// Also print the full summary table to stdout
		console.log('');
		console.log('==================================================================');
		console.log(' CHAT PERF COMPARISON RESULTS ');
		console.log('==================================================================');
		console.log('');
		console.log(summary);
	}

	// Exit non-zero on regression (caller never regains control in that case).
	if (regressionFound) { process.exit(1); }
	return regressedScenarios;
}

main().catch(err => { console.error(err); process.exit(1); });