Path: blob/main/scripts/chat-simulation/merge-ci-summary.js
13379 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

// @ts-check

/**
 * Merge per-group perf results into a single unified CI summary.
 *
 * Called by the CI report job after all matrix groups have finished.
 * Reads results.json and baseline-*.json from each group directory,
 * merges all scenarios into one combined report, and writes a single
 * ci-summary.md file.
 *
 * Usage:
 *   node scripts/chat-simulation/merge-ci-summary.js \
 *     --results-dir perf-results \
 *     --output ci-summary.md \
 *     [--leak-summary leak-results/.chat-simulation-data/ci-summary-leak.md] \
 *     [--threshold 0.2]
 */

const fs = require('fs');
const path = require('path');
const { welchTTest, loadConfig } = require('./common/utils');

// -- CLI args ----------------------------------------------------------------

/**
 * Parse the command line into an options object.
 *
 * Prints usage and exits with code 0 on `--help` / `-h`. Exits with code 1
 * when `--results-dir` or `--output` is missing, or when `--threshold` is
 * not a finite number. Unknown arguments are ignored.
 *
 * @returns {{ resultsDir: string, output: string, leakSummary: string | undefined, threshold: number, metricThresholds: Record<string, number | string> }}
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		resultsDir: '',
		output: '',
		/** @type {string | undefined} */
		leakSummary: undefined,
		threshold: 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: {},
	};
	for (let i = 0; i < args.length; i++) {
		switch (args[i]) {
			case '--results-dir': opts.resultsDir = args[++i]; break;
			case '--output': opts.output = args[++i]; break;
			case '--leak-summary': opts.leakSummary = args[++i]; break;
			case '--threshold': opts.threshold = parseFloat(args[++i]); break;
			case '--help': case '-h':
				console.log([
					'Merge per-group perf results into a single CI summary.',
					'',
					'Options:',
					' --results-dir <dir> Directory containing perf-results-* or perf-summary-* subdirs',
					' --output <path> Output path for ci-summary.md',
					' --leak-summary <path> Path to ci-summary-leak.md (optional)',
					' --threshold <frac> Regression threshold fraction (default: 0.2)',
				].join('\n'));
				process.exit(0);
		}
	}
	if (!opts.resultsDir || !opts.output) {
		console.error('Required: --results-dir and --output');
		process.exit(1);
	}
	// A missing or non-numeric --threshold value parses to NaN; every
	// comparison against NaN is false, which would silently disable
	// regression detection and print "NaN%" in the report.
	if (!Number.isFinite(opts.threshold)) {
		console.error('Invalid --threshold: expected a number such as 0.2');
		process.exit(1);
	}
	return opts;
}

// -- Merge logic -------------------------------------------------------------

/**
 * Find all results.json and baseline-*.json files across group directories,
 * merge scenarios into a single combined report.
 *
 * When the same scenario name appears in several files, the file read last
 * wins. Returns `null` when no group directory can be found.
 *
 * @param {string} resultsDir
 */
function mergeResults(resultsDir) {
	let groupDirs = fs.readdirSync(resultsDir)
		.filter(d => d.startsWith('perf-results-') || d.startsWith('perf-summary-'))
		.map(d => path.join(resultsDir, d))
		.filter(d => fs.statSync(d).isDirectory());

	// Fallback: when download-artifact extracts a single artifact directly into
	// resultsDir (no artifact-named subdirectory), treat resultsDir itself as the
	// sole group directory if it contains a .chat-simulation-data folder.
	if (groupDirs.length === 0) {
		const simDataDir = path.join(resultsDir, '.chat-simulation-data');
		if (fs.existsSync(simDataDir) && fs.statSync(simDataDir).isDirectory()) {
			console.log(`No named subdirectories found; using ${resultsDir} directly as single group`);
			groupDirs = [resultsDir];
		} else {
			console.error(`No perf-results-* or perf-summary-* directories found in ${resultsDir}`);
			return null;
		}
	}

	/** @type {Record<string, any>} */
	const mergedScenarios = {};
	/** @type {Record<string, any>} */
	const mergedBaselineScenarios = {};
	let runsPerScenario = 0;
	let platform = 'linux';
	/** @type {string | undefined} */
	let buildMode;
	/** @type {string | undefined} */
	let baselineBuildVersion;
	/** @type {string | undefined} */
	let threshold;

	// Read per-metric thresholds from config.jsonc (same source as the perf script)
	const perfConfig = loadConfig('perfRegression');
	/** @type {Record<string, number | string>} */
	const metricThresholds = perfConfig.metricThresholds ?? {};

	const isBaselineFile = (/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json');

	// Fold one baseline-*.json file into the merged baseline state.
	const mergeBaselineFile = (/** @type {string} */ file) => {
		const baseline = JSON.parse(fs.readFileSync(file, 'utf-8'));
		baselineBuildVersion = baseline.baselineBuildVersion || baselineBuildVersion;
		for (const [scenario, data] of Object.entries(baseline.scenarios || {})) {
			mergedBaselineScenarios[scenario] = data;
		}
	};

	for (const groupDir of groupDirs) {
		// Find results.json (may be in a timestamped subdir under .chat-simulation-data)
		const simDataDir = path.join(groupDir, '.chat-simulation-data');
		if (!fs.existsSync(simDataDir)) { continue; }

		// Search for results.json in timestamped subdirs
		const subdirs = fs.readdirSync(simDataDir).filter(d => {
			const full = path.join(simDataDir, d);
			return fs.statSync(full).isDirectory() && /^\d{4}-/.test(d);
		});

		for (const subdir of subdirs) {
			const runDir = path.join(simDataDir, subdir);
			const resultsPath = path.join(runDir, 'results.json');
			if (fs.existsSync(resultsPath)) {
				const results = JSON.parse(fs.readFileSync(resultsPath, 'utf-8'));
				runsPerScenario = results.runsPerScenario || runsPerScenario;
				platform = results.platform || platform;
				buildMode = results.buildMode || buildMode;
				for (const [scenario, data] of Object.entries(results.scenarios || {})) {
					mergedScenarios[scenario] = data;
				}
			}

			// Find baseline-*.json in the same dir
			for (const bf of fs.readdirSync(runDir).filter(isBaselineFile)) {
				mergeBaselineFile(path.join(runDir, bf));
			}
		}

		// Also check for baseline cached at top-level .chat-simulation-data
		for (const bf of fs.readdirSync(simDataDir).filter(isBaselineFile)) {
			mergeBaselineFile(path.join(simDataDir, bf));
		}

		// Read threshold/metricThresholds from the group's ci-summary or config
		const ciSummaryPath = path.join(simDataDir, 'ci-summary.md');
		if (fs.existsSync(ciSummaryPath)) {
			const content = fs.readFileSync(ciSummaryPath, 'utf-8');
			const thresholdMatch = content.match(/Regression threshold\*\* \| (\d+)%/);
			if (thresholdMatch) {
				threshold = thresholdMatch[1];
			}
		}
	}

	const mergedReport = {
		timestamp: new Date().toISOString(),
		platform,
		runsPerScenario,
		buildMode,
		scenarios: mergedScenarios,
	};

	const mergedBaseline = Object.keys(mergedBaselineScenarios).length > 0
		? { baselineBuildVersion, scenarios: mergedBaselineScenarios }
		: null;

	return {
		report: mergedReport,
		baseline: mergedBaseline,
		baselineBuildVersion,
		// The per-group summary prints the threshold as a whole percentage;
		// convert it back to a fraction.
		threshold: threshold ? parseInt(threshold, 10) / 100 : undefined,
		metricThresholds,
		// Number of group directories considered, including the single-group fallback.
		groupCount: groupDirs.length,
	};
}

// -- Summary generation (unified, single-header format) ----------------------

const GITHUB_REPO = 'https://github.com/microsoft/vscode';

/**
 * Render a build label as Markdown: a commit link for a 7-40 character hex
 * string, a release-tag link for a label starting with `x.y.z`, and plain
 * inline code otherwise.
 *
 * @param {string} label
 */
function formatBuildLink(label) {
	if (/^[0-9a-f]{7,40}$/.test(label)) {
		return `[\`${label.substring(0, 7)}\`](${GITHUB_REPO}/commit/${label})`;
	}
	if (/^\d+\.\d+\.\d+/.test(label)) {
		return `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`;
	}
	return `\`${label}\``;
}

/**
 * Render a Markdown "compare" link between two builds, or an empty string
 * when either label is neither a commit hash nor a version.
 *
 * @param {string} base
 * @param {string} test
 */
function formatCompareLink(base, test) {
	const isRef = (/** @type {string} */ v) => /^[0-9a-f]{7,40}$/.test(v) || /^\d+\.\d+\.\d+/.test(v);
	if (!isRef(base) || !isRef(test)) { return ''; }
	return `[compare](${GITHUB_REPO}/compare/${base}...${test})`;
}

/**
 * Test a change against a threshold: 'absolute' thresholds compare the raw
 * delta, every other type compares the relative change.
 *
 * @param {{ type: string, value: number }} threshold
 * @param {number} change
 * @param {number} absoluteDelta
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	if (threshold.type === 'absolute') { return absoluteDelta > threshold.value; }
	return change > threshold.value;
}

/**
 * Resolve the threshold for one metric. A numeric override is a fraction, a
 * string override is parsed as an absolute delta, and a metric without an
 * override uses the global fractional threshold.
 *
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
 * @param {string} metric
 */
function getMetricThreshold(opts, metric) {
	const raw = opts.metricThresholds?.[metric];
	if (raw === undefined) {
		return { type: 'fraction', value: opts.threshold };
	}
	if (typeof raw === 'number') {
		return { type: 'fraction', value: raw };
	}
	return { type: 'absolute', value: parseFloat(raw) };
}

/** @param {number} v */
function round2(v) { return Math.round(v * 100) / 100; }

/**
 * Append a heading and a Markdown table with one row per raw run.
 *
 * @param {string[]} lines Output buffer, mutated in place.
 * @param {string} heading Text of the `###` heading.
 * @param {any[]} runs Raw run records.
 */
function appendRawRunTable(lines, heading, runs) {
	lines.push(`### ${heading}`);
	lines.push('');
	lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) |');
	lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|');
	for (let i = 0; i < runs.length; i++) {
		const r = runs[i];
		lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${round2(r.longAnimationFrameTotalMs ?? 0) || '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} |`);
	}
	lines.push('');
}

/**
 * Generate a unified Markdown summary for all scenarios.
 *
 * The report has a verdict header, an overview table, a compact table of
 * regressions and improvements, and collapsible sections with full metric
 * details and raw run data.
 *
 * @param {Record<string, any>} jsonReport
 * @param {Record<string, any> | null} baseline
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string, hasLeakFailure?: boolean }} opts
 */
function generateUnifiedSummary(jsonReport, baseline, opts) {
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);

	// [metric name, group key inside the scenario data, display unit]
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can produce a REGRESSION / improved verdict; all
	// other metrics are reported as 'info'.
	const regressionMetricNames = new Set([
		'timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount',
		'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount',
	]);

	/** @type {string[]} */
	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);

	// -- Collect verdicts ------------------------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;

	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];

		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }

				// A zero baseline has no meaningful relative change; report 0.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);

				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let verdict = '';
				if (isRegressionMetric) {
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						// Without a t-test result the change is treated as a regression.
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}

				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}

	// -- Header ----------------------------------------------------------
	const hasRegressions = totalRegressions > 0;
	const hasLeakFailure = !!opts.hasLeakFailure;
	const hasFailed = hasRegressions || hasLeakFailure;
	const verdictIcon = hasFailed ? '\u274C' : '\u2705';
	const verdictParts = [];
	if (hasRegressions && totalImprovements > 0) {
		verdictParts.push(`${totalRegressions} regression(s), ${totalImprovements} improvement(s)`);
	} else if (hasRegressions) {
		verdictParts.push(`${totalRegressions} regression(s) detected`);
	} else if (totalImprovements > 0) {
		verdictParts.push(`No regressions \u2014 ${totalImprovements} improvement(s)`);
	} else {
		verdictParts.push('No significant changes');
	}
	if (hasLeakFailure) {
		verdictParts.push('memory leak detected');
	}
	const verdictText = verdictParts.join('; ');

	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// List only the overrides that differ from the global threshold: every
	// absolute (string) override, and fractional ones with a different value.
	const overrides = Object.entries(opts.metricThresholds || {})
		.filter(([, v]) => typeof v !== 'number' || v !== opts.threshold);
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			if (typeof v === 'number') {
				return `${k}: ${(v * 100).toFixed(0)}%`;
			}
			return `${k}: ${v}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${jsonReport.platform || 'linux'} / x64 |`);
	lines.push('');

	// -- Overview table --------------------------------------------------
	lines.push('## Overview');
	lines.push('');
	lines.push('| Scenario | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-----:|---------:|--------:|-------:|-----:|:-------:|');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);

		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');

		const fmtCell = (/** @type {{ change: number } | undefined} */ v) => {
			if (!v) { return '\u2014'; }
			return `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
		};

		const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
		const hasRegression = keyVerdicts.some(v => v?.verdict === 'REGRESSION');
		const hasImproved = keyVerdicts.some(v => v?.verdict === 'improved');
		const rowVerdict = hasRegression ? '\u274C' : hasImproved ? '\u2B06\uFE0F' : '\u2705';

		lines.push(`| ${scenario} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');

	// -- Regressions & improvements (compact table) ----------------------
	const notableRows = [];
	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		for (const v of verdicts) {
			if (v.verdict === 'REGRESSION' || v.verdict === 'improved') {
				notableRows.push({ scenario, ...v });
			}
		}
	}

	if (notableRows.length > 0) {
		lines.push('## Regressions & Improvements');
		lines.push('');

		lines.push('| Scenario | Metric | Baseline | Test | Change | p-value | |');
		lines.push('|----------|--------|----------|------|-------:|--------:|:-:|');
		for (const r of notableRows) {
			const pct = `${r.change > 0 ? '+' : ''}${(r.change * 100).toFixed(1)}%`;
			const icon = r.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`| ${r.scenario} | ${r.metric} | ${r.basStr} | ${r.curStr} | ${pct} | ${r.pValue} | ${icon} |`);
		}
		lines.push('');
	}

	// -- Full details (collapsible) --------------------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];

		lines.push(`### ${scenario}`);
		lines.push('');

		if (!base) {
			// No comparison possible: list the test build's own statistics.
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}

		lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
		lines.push('|--------|----------|------|--------|---------|---------|');
		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');

	// -- Raw run data (collapsible) --------------------------------------
	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		appendRawRunTable(lines, scenario, current.rawRuns || []);
	}
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			appendRawRunTable(lines, `${scenario} (baseline)`, base.rawRuns || []);
		}
	}
	lines.push('</details>');
	lines.push('');

	return lines.join('\n');
}

// -- Main --------------------------------------------------------------------

/**
 * Entry point: merge all group results, generate the unified summary, append
 * the leak summary when one is available, and write the output file.
 *
 * When nothing can be merged, a short warning is written to the output file
 * and the process exits with code 0.
 */
function main() {
	const opts = parseArgs();
	const merged = mergeResults(opts.resultsDir);

	if (!merged) {
		const fallback = '\u26A0\uFE0F No perf results found to merge. Check perf-output.log artifacts.\n';
		fs.writeFileSync(opts.output, fallback);
		console.log('[merge] No results found.');
		process.exit(0);
	}

	const { report, baseline, baselineBuildVersion, groupCount } = merged;
	const scenarioCount = Object.keys(report.scenarios).length;
	// Use the count from mergeResults: re-scanning the directory by name
	// prefix would report 0 groups when the single-group fallback was used.
	console.log(`[merge] Merged ${scenarioCount} scenarios from ${groupCount} groups`);
	if (baseline) {
		console.log(`[merge] Baseline: ${baselineBuildVersion || 'unknown'} (${Object.keys(baseline.scenarios).length} scenarios)`);
	}

	// Read leak summary early so we can reflect it in the header verdict
	let leakSummaryContent = '';
	let hasLeakFailure = false;
	if (opts.leakSummary && fs.existsSync(opts.leakSummary)) {
		leakSummaryContent = fs.readFileSync(opts.leakSummary, 'utf-8');
		// The leak report marks failures with a cross-mark emoji.
		hasLeakFailure = leakSummaryContent.includes('\u274C');
		console.log(`[merge] Leak summary found (failure: ${hasLeakFailure})`);
	}

	const summary = generateUnifiedSummary(report, baseline, {
		// `??` rather than `||`: a threshold of 0 recovered from the group
		// summaries is a valid value and must not fall back to the CLI default.
		threshold: merged.threshold ?? opts.threshold,
		metricThresholds: merged.metricThresholds,
		runs: report.runsPerScenario,
		baselineBuild: baselineBuildVersion,
		build: process.env.TEST_COMMIT || undefined,
		hasLeakFailure,
	});

	// Append leak summary if available
	let fullSummary = summary;
	if (leakSummaryContent) {
		fullSummary += '\n' + leakSummaryContent;
	}

	fs.writeFileSync(opts.output, fullSummary);
	console.log(`[merge] Summary written to ${opts.output}`);
}

main();