Path: blob/main/extensions/copilot/script/analyzeEdits.ts
13383 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { promises as fs } from 'fs';
import * as path from 'path';
import * as readline from 'readline';

// Edit tool names we're tracking
const EDIT_TOOL_NAMES = ['insert_edit_into_file', 'replace_string_in_file', 'multi_replace_string_in_file', 'apply_patch'];

// Tool names that indicate a continuation/retry attempt
const CONTINUATION_TOOL_NAMES = ['read_file'];

// Substring of a tool response that marks an edit as successful
const SUCCESS_MARKER = 'successfully edited';

// How many tool calls after a failed edit are inspected when looking for a retry
const RETRY_LOOKAHEAD = 10;

// How many runs are listed in the interactive prompt
const MAX_LISTED_RUNS = 10;

// Run directory used when no --basePath=<dir> argument is given
const DEFAULT_BASE_PATH = '/Users/connor/Github/vscode-copilot-evaluation/.msbenchRun';

/** One entry of `trajectories/trajectory.json`, as far as this script reads it. */
interface ToolCall {
	tool: string;
	input_tokens?: number;
	cached_input_tokens?: number;
	output_tokens?: number;
	response: string | string[];
	edits?: Array<{
		path: string;
		edits: {
			replacements: Array<{
				replaceRange: { start: number; endExclusive: number };
				newText: string;
			}>;
		};
	}>;
}

/** A single edit-tool invocation extracted from a trajectory. */
interface EditOperation {
	toolName: string;
	timestamp: string;
	success: boolean;
	filePath?: string;
	/** Zero-based position among the edit operations of the conversation. */
	turnIndex: number;
	/** True when this edit failed and a later edit (after a continuation tool) retried it. */
	isRetry: boolean;
	/** Outcome of that later edit; undefined when no retry was found. */
	retrySucceeded?: boolean;
}

interface ConversationAnalysis {
	conversationPath: string;
	edits: EditOperation[];
	totalEdits: number;
	successfulEdits: number;
	failedEdits: number;
	successfulEditsWithRetries: number;
	totalUniqueEdits: number;
	modelName?: string;
}

interface RunAnalysis {
	runId: string;
	conversations: ConversationAnalysis[];
	totalEdits: number;
	successRate: number;
	successRateWithRetries: number;
	totalUniqueEdits: number;
	modelName?: string;
}

/** Flattened edit row used for both the HTML table and the client-side Sankey data. */
interface EditRow {
	conversation: string;
	toolName: string;
	timestamp: string;
	success: boolean;
	turnIndex: number;
	isRetry: boolean;
	retrySucceeded?: boolean;
	filePath?: string;
}

const HTML_ESCAPES: Record<string, string> = {
	'&': '&amp;',
	'<': '&lt;',
	'>': '&gt;',
	'"': '&quot;',
	'\'': '&#39;'
};

/** Escapes text so it can be placed in HTML element content or a quoted attribute. */
function escapeHtml(value: string): string {
	return value.replace(/[&<>"']/g, ch => HTML_ESCAPES[ch] ?? ch);
}

/**
 * Serializes a value as JSON that is safe to embed inside an inline script element:
 * `<` is emitted as an escape sequence so the data can never close the script tag,
 * and U+2028/U+2029 are escaped because older engines treat them as line terminators.
 */
function serializeForScript(value: unknown): string {
	return JSON.stringify(value)
		.replace(/</g, '\\u003c')
		.replace(/\u2028/g, '\\u2028')
		.replace(/\u2029/g, '\\u2029');
}

/** Returns true when the (first) response string of a tool call reports a successful edit. */
function isSuccessfulResponse(response: string | string[] | undefined): boolean {
	const first = Array.isArray(response) ? response[0] : response;
	return typeof first === 'string' && first.includes(SUCCESS_MARKER);
}

/** Runtime check that a parsed JSON value is an object carrying a string `tool` name. */
function isToolCall(value: unknown): value is ToolCall {
	return typeof value === 'object' && value !== null && typeof (value as { tool?: unknown }).tool === 'string';
}

/** Builds the analysis returned for a conversation without a usable trajectory. */
function emptyConversationAnalysis(conversationPath: string): ConversationAnalysis {
	return {
		conversationPath,
		edits: [],
		totalEdits: 0,
		successfulEdits: 0,
		failedEdits: 0,
		successfulEditsWithRetries: 0,
		totalUniqueEdits: 0
	};
}

/**
 * Looks for the pattern "failed edit -> continuation tool -> another edit" starting
 * right after `failedIndex`.
 *
 * @returns the outcome of the retrying edit, or undefined when the next edit is not
 * preceded by a continuation tool or no edit occurs within RETRY_LOOKAHEAD calls.
 */
function findRetryOutcome(toolCalls: readonly ToolCall[], failedIndex: number): boolean | undefined {
	// Inspect exactly RETRY_LOOKAHEAD calls after the failed one.
	const limit = Math.min(toolCalls.length, failedIndex + 1 + RETRY_LOOKAHEAD);
	let sawContinuation = false;

	for (let j = failedIndex + 1; j < limit; j++) {
		const next = toolCalls[j];
		if (CONTINUATION_TOOL_NAMES.includes(next.tool)) {
			sawContinuation = true;
			continue;
		}
		if (!EDIT_TOOL_NAMES.includes(next.tool)) {
			continue;
		}
		// The first edit after the failure decides: it is a retry only if a
		// continuation tool ran in between.
		return sawContinuation ? isSuccessfulResponse(next.response) : undefined;
	}
	return undefined;
}

/**
 * Lists the run directories below `amlOutPath`.
 *
 * @returns names of all sub-directories whose name is purely numeric, sorted
 * numerically descending (newest first).
 */
async function listRuns(amlOutPath: string): Promise<string[]> {
	const entries = await fs.readdir(amlOutPath, { withFileTypes: true });
	return entries
		.filter(e => e.isDirectory() && /^\d+$/.test(e.name))
		.map(e => e.name)
		.sort((a, b) => Number(b) - Number(a));
}

/**
 * Prints the newest runs and asks on stdin which one to analyze.
 *
 * @param runs run ids, newest first; must not be empty.
 * @returns the chosen run id; an empty or invalid answer selects `runs[0]`.
 */
async function promptUserForRun(runs: string[]): Promise<string> {
	console.log('\nAvailable test runs (newest first):');
	runs.slice(0, MAX_LISTED_RUNS).forEach((run, i) => {
		console.log(` ${i + 1}. ${run}`);
	});
	if (runs.length > MAX_LISTED_RUNS) {
		console.log(` ... and ${runs.length - MAX_LISTED_RUNS} more`);
	}

	const rl = readline.createInterface({
		input: process.stdin,
		output: process.stdout
	});

	return new Promise((resolve) => {
		rl.question('\nEnter run number (or press Enter for the most recent): ', (answer) => {
			rl.close();
			const choice = answer.trim();
			if (choice === '') {
				resolve(runs[0]);
				return;
			}
			// The prompt is 1-based; NaN fails both comparisons and falls through.
			const index = Number.parseInt(choice, 10) - 1;
			if (index >= 0 && index < runs.length) {
				resolve(runs[index]);
			} else {
				console.log('Invalid selection, using most recent run.');
				resolve(runs[0]);
			}
		});
	});
}

/**
 * Extracts all edit-tool calls from `<conversationPath>/trajectories/trajectory.json`
 * and computes per-conversation success statistics.
 *
 * A missing, unreadable or malformed trajectory is reported with a warning and yields
 * an analysis with zero edits.
 */
async function analyzeConversation(conversationPath: string): Promise<ConversationAnalysis> {
	const trajectoryPath = path.join(conversationPath, 'trajectories', 'trajectory.json');

	let toolCalls: ToolCall[];
	// Never populated here: nothing read from the trajectory provides a model name.
	const modelName: string | undefined = undefined;

	try {
		const parsed: unknown = JSON.parse(await fs.readFile(trajectoryPath, 'utf-8'));
		if (!Array.isArray(parsed)) {
			throw new Error('expected a JSON array of tool calls');
		}
		// Drop entries that are not tool-call objects so one bad entry cannot abort the analysis.
		toolCalls = parsed.filter(isToolCall);
	} catch (error: unknown) {
		const reason = error instanceof Error ? error.message : String(error);
		console.warn(`Could not read trajectory file: ${trajectoryPath} (${reason})`);
		return emptyConversationAnalysis(conversationPath);
	}

	const edits: EditOperation[] = [];

	toolCalls.forEach((toolCall, i) => {
		if (!EDIT_TOOL_NAMES.includes(toolCall.tool)) {
			return;
		}

		const success = isSuccessfulResponse(toolCall.response);
		// Only failed edits can have a retry.
		const retrySucceeded = success ? undefined : findRetryOutcome(toolCalls, i);

		edits.push({
			toolName: toolCall.tool,
			timestamp: new Date().toISOString(), // Trajectory doesn't have timestamps, use current time
			success,
			// Only the first edited file is reported.
			filePath: toolCall.edits && toolCall.edits.length > 0 ? toolCall.edits[0].path : undefined,
			turnIndex: edits.length,
			isRetry: retrySucceeded !== undefined,
			retrySucceeded
		});
	});

	const successfulEdits = edits.filter(e => e.success).length;

	// Success accounting with retries: a failed edit whose retry succeeded counts as a success.
	const editsWithRetries = edits.filter(e => !e.success && e.isRetry);
	const retriedSuccesses = editsWithRetries.filter(e => e.retrySucceeded).length;
	const successfulEditsWithRetries = successfulEdits + retriedSuccesses;
	// NOTE(review): isRetry is only ever set together with retrySucceeded, so the two
	// correction terms cancel and this currently always equals edits.length. Confirm
	// whether a failed attempt and its retry were meant to count as one unique edit.
	const totalUniqueEdits = edits.length - editsWithRetries.length + editsWithRetries.filter(e => e.retrySucceeded !== undefined).length;

	return {
		conversationPath,
		edits,
		totalEdits: edits.length,
		successfulEdits,
		failedEdits: edits.length - successfulEdits,
		successfulEditsWithRetries,
		totalUniqueEdits,
		modelName
	};
}

/**
 * Analyzes every conversation directory of a run and aggregates the results.
 *
 * Conversations without any edit are omitted. Errors while reading the run directory
 * are logged; the conversations analyzed up to that point are still aggregated.
 */
async function analyzeRun(runId: string, basePath: string): Promise<RunAnalysis> {
	const runPath = path.join(basePath, runId);
	const conversations: ConversationAnalysis[] = [];

	try {
		const entries = await fs.readdir(runPath, { withFileTypes: true });

		// Deliberately sequential: keeps directory order and preserves the
		// partial results if a later conversation throws.
		for (const entry of entries) {
			if (!entry.isDirectory()) {
				continue;
			}
			const analysis = await analyzeConversation(path.join(runPath, entry.name));
			if (analysis.totalEdits > 0) {
				conversations.push(analysis);
			}
		}
	} catch (error) {
		console.error(`Error reading run directory: ${error}`);
	}

	const sumOf = (pick: (c: ConversationAnalysis) => number): number =>
		conversations.reduce((sum, c) => sum + pick(c), 0);

	const totalEdits = sumOf(c => c.totalEdits);
	const totalSuccessful = sumOf(c => c.successfulEdits);
	const totalSuccessfulWithRetries = sumOf(c => c.successfulEditsWithRetries);
	const totalUniqueEdits = sumOf(c => c.totalUniqueEdits);

	// Get model name from first conversation that has one
	const modelName = conversations.find(c => c.modelName)?.modelName;

	return {
		runId,
		conversations,
		totalEdits,
		successRate: totalEdits > 0 ? totalSuccessful / totalEdits : 0,
		successRateWithRetries: totalUniqueEdits > 0 ? totalSuccessfulWithRetries / totalUniqueEdits : 0,
		totalUniqueEdits,
		modelName
	};
}

/** Renders the "Retry" table cell of one edit row. */
function renderRetryCell(row: EditRow): string {
	if (!row.isRetry) {
		return '-';
	}
	if (row.retrySucceeded === true) {
		return '<span class="badge badge-success">✓ Retry Success</span>';
	}
	if (row.retrySucceeded === false) {
		return '<span class="badge badge-failed">✗ Retry Failed</span>';
	}
	return '<span class="badge" style="background: #e3e3e3; color: #666;">Retry Pending</span>';
}

/** Renders one row of the edit-operations table; all data-derived text is HTML-escaped. */
function renderTableRow(row: EditRow): string {
	return `
					<tr>
						<td>${escapeHtml(row.conversation)}</td>
						<td><code style="background: #f6f8fa; padding: 2px 6px; border-radius: 3px; font-size: 12px;">${escapeHtml(row.toolName)}</code></td>
						<td>${row.turnIndex}</td>
						<td style="color: #666; font-size: 12px; max-width: 300px; overflow: hidden; text-overflow: ellipsis;">${escapeHtml(row.filePath || '-')}</td>
						<td><span class="badge ${row.success ? 'badge-success' : 'badge-failed'}">${row.success ? '✓ Success' : '✗ Failed'}</span></td>
						<td>${renderRetryCell(row)}</td>
					</tr>
`;
}

/**
 * Produces a self-contained HTML report (summary cards, Sankey diagram, edit table).
 *
 * The Sankey diagram is built in the browser from the embedded edit rows, so the
 * "include retries" checkbox can switch views without regenerating the report.
 *
 * @param analysis aggregated run data to render.
 * @param outputPath currently unused; kept for interface compatibility.
 * @param includeRetries initial state of the "include retries" checkbox.
 * @returns the complete HTML document.
 */
function generateHTML(analysis: RunAnalysis, outputPath: string, includeRetries: boolean = false): string {
	const tableRows: EditRow[] = analysis.conversations.flatMap(conv =>
		conv.edits.map(edit => ({
			conversation: path.basename(conv.conversationPath),
			toolName: edit.toolName,
			timestamp: edit.timestamp,
			success: edit.success,
			turnIndex: edit.turnIndex,
			isRetry: edit.isRetry,
			retrySucceeded: edit.retrySucceeded,
			filePath: edit.filePath
		}))
	);

	const heading = `Run ${escapeHtml(analysis.runId)}${analysis.modelName ? ' - ' + escapeHtml(analysis.modelName) : ''}`;
	const successRateText = `${(analysis.successRate * 100).toFixed(1)}%`;
	const analysisData = {
		successRate: analysis.successRate,
		successRateWithRetries: analysis.successRateWithRetries,
		totalEdits: analysis.totalEdits,
		totalUniqueEdits: analysis.totalUniqueEdits
	};

	const html = `<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>${heading}</title>
	<script src="https://unpkg.com/d3@7/dist/d3.min.js"></script>
	<script src="https://unpkg.com/d3-sankey@0.12.3/dist/d3-sankey.min.js"></script>
	<style>
		* {
			box-sizing: border-box;
		}

		body {
			font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
			margin: 0;
			padding: 20px;
			background: #f5f5f5;
			color: #333;
		}

		.container {
			max-width: 1400px;
			margin: 0 auto;
			background: white;
			padding: 30px;
			border-radius: 8px;
			box-shadow: 0 2px 8px rgba(0,0,0,0.1);
		}

		h1 {
			margin: 0 0 10px 0;
			color: #1a1a1a;
		}

		.stats {
			display: grid;
			grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
			gap: 15px;
			margin: 20px 0;
		}

		.stat-card {
			background: #f8f9fa;
			padding: 15px;
			border-radius: 6px;
			border-left: 4px solid #0969da;
		}

		.stat-label {
			font-size: 12px;
			text-transform: uppercase;
			color: #666;
			margin-bottom: 5px;
		}

		.stat-value {
			font-size: 24px;
			font-weight: 600;
			color: #1a1a1a;
		}

		.controls {
			margin: 20px 0;
			padding: 15px;
			background: #f8f9fa;
			border-radius: 6px;
		}

		.controls label {
			display: inline-flex;
			align-items: center;
			cursor: pointer;
			font-size: 14px;
		}

		.controls input[type="checkbox"] {
			margin-right: 8px;
			width: 18px;
			height: 18px;
			cursor: pointer;
		}

		#sankey-diagram {
			margin: 30px 0;
			overflow-x: auto;
		}

		.table-container {
			margin-top: 30px;
			overflow-x: auto;
		}

		table {
			width: 100%;
			border-collapse: collapse;
			font-size: 14px;
		}

		thead {
			background: #f8f9fa;
		}

		th {
			text-align: left;
			padding: 12px;
			font-weight: 600;
			color: #1a1a1a;
			border-bottom: 2px solid #dee2e6;
		}

		td {
			padding: 10px 12px;
			border-bottom: 1px solid #dee2e6;
		}

		tbody tr:hover {
			background: #f8f9fa;
		}

		.badge {
			display: inline-block;
			padding: 3px 8px;
			border-radius: 12px;
			font-size: 12px;
			font-weight: 500;
		}

		.badge-success {
			background: #d1f4e0;
			color: #0f6d31;
		}

		.badge-failed {
			background: #ffd8d8;
			color: #d1242f;
		}

		.sankey-node rect {
			cursor: pointer;
			fill-opacity: 0.9;
		}

		.sankey-node rect:hover {
			fill-opacity: 1;
		}

		.sankey-link {
			fill: none;
			stroke-opacity: 0.3;
		}

		.sankey-link:hover {
			stroke-opacity: 0.5;
		}

		.sankey-node text {
			pointer-events: none;
			font-size: 12px;
			fill: #1a1a1a;
		}
	</style>
</head>
<body>
	<div class="container">
		<h1>🔧 ${heading}</h1>
		<p style="color: #666; margin: 5px 0 0 0;">Analysis of edit tool operations and success rates</p>

		<div class="stats">
			<div class="stat-card">
				<div class="stat-label">Total Edits</div>
				<div class="stat-value">${analysis.totalEdits}</div>
			</div>
			<div class="stat-card" style="border-left-color: #2da44e;">
				<div class="stat-label">Success Rate</div>
				<div class="stat-value" id="success-rate-value">${successRateText}</div>
			</div>
			<div class="stat-card" style="border-left-color: #8250df;">
				<div class="stat-label">Conversations</div>
				<div class="stat-value">${analysis.conversations.length}</div>
			</div>
		</div>

		<div class="controls">
			<label>
				<input type="checkbox" id="includeRetries" ${includeRetries ? 'checked' : ''}>
				Include retries (show re-evaluate → retry flows)
			</label>
		</div>

		<div id="sankey-diagram"></div>

		<h2 style="margin-top: 40px;">Edit Operations</h2>
		<div class="table-container">
			<table>
				<thead>
					<tr>
						<th>Conversation</th>
						<th>Tool</th>
						<th>Turn</th>
						<th>File</th>
						<th>Status</th>
						<th>Retry</th>
					</tr>
				</thead>
				<tbody>
${tableRows.map(renderTableRow).join('')}
				</tbody>
			</table>
		</div>
	</div>

	<script>
		const allEdits = ${serializeForScript(tableRows)};
		const analysisData = ${serializeForScript(analysisData)};
		const RETRY_SUFFIX = ' (retry)';

		function drawSankey(includeRetries) {
			// Clear previous diagram
			d3.select('#sankey-diagram').html('');

			// Rebuild data based on includeRetries flag
			const nodes = [];
			const links = [];
			const nodeMap = new Map();

			const getNodeIndex = (name) => {
				if (!nodeMap.has(name)) {
					nodeMap.set(name, nodes.length);
					nodes.push({ name });
				}
				return nodeMap.get(name);
			};

			const flows = new Map();
			const addFlow = (source, target) => {
				const flowKey = source + '->' + target;
				flows.set(flowKey, (flows.get(flowKey) || 0) + 1);
			};

			for (const edit of allEdits) {
				const toolNode = edit.toolName;

				// Check if this is a failed edit with a retry
				if (includeRetries && !edit.success && edit.isRetry && edit.retrySucceeded !== undefined) {
					// Show full retry flow: Tool -> Failed -> read_file -> Retry Edit -> Final Result
					const failedNode = 'Failed (will retry)';
					const readFileNode = 'read_file';
					const retryEditNode = toolNode + RETRY_SUFFIX;
					const finalResult = edit.retrySucceeded ? 'Success' : 'Failed';

					addFlow(toolNode, failedNode);
					addFlow(failedNode, readFileNode);
					addFlow(readFileNode, retryEditNode);
					addFlow(retryEditNode, finalResult);
					continue;
				}

				// Tool -> Success/Fail
				addFlow(toolNode, edit.success ? 'Success' : 'Failed');
			}

			for (const [flowKey, count] of flows.entries()) {
				const [source, target] = flowKey.split('->');
				links.push({
					source: getNodeIndex(source),
					target: getNodeIndex(target),
					value: count
				});
			}

			const width = Math.max(800, document.getElementById('sankey-diagram').offsetWidth);
			const height = 500;

			const svg = d3.select('#sankey-diagram')
				.append('svg')
				.attr('width', width)
				.attr('height', height);

			const sankey = d3.sankey()
				.nodeWidth(15)
				.nodePadding(10)
				.extent([[1, 1], [width - 1, height - 5]]);

			const graph = sankey({
				nodes: nodes.map(d => Object.assign({}, d)),
				links: links.map(d => Object.assign({}, d))
			});

			// Every node name gets an explicit colour; names missing from an ordinal
			// scale's domain would otherwise be assigned recycled range colours.
			const colorScale = d3.scaleOrdinal()
				.domain(['replace_string_in_file', 'multi_replace_string_in_file', 'read_file', 'Failed (will retry)', 'Success', 'Failed', 'insert_edit_into_file', 'apply_patch'])
				.range(['#0969da', '#8250df', '#a855f7', '#ff9800', '#2da44e', '#d1242f', '#1b7c83', '#bf3989']);

			// Retry nodes share the colour of the tool they retry.
			const nodeColor = (name) => colorScale(name.endsWith(RETRY_SUFFIX) ? name.slice(0, -RETRY_SUFFIX.length) : name);

			// Links
			svg.append('g')
				.attr('class', 'links')
				.selectAll('path')
				.data(graph.links)
				.enter()
				.append('path')
				.attr('class', 'sankey-link')
				.attr('d', d3.sankeyLinkHorizontal())
				.attr('stroke', d => nodeColor(d.source.name))
				.attr('stroke-width', d => Math.max(1, d.width));

			// Nodes
			const node = svg.append('g')
				.attr('class', 'nodes')
				.selectAll('g')
				.data(graph.nodes)
				.enter()
				.append('g')
				.attr('class', 'sankey-node');

			node.append('rect')
				.attr('x', d => d.x0)
				.attr('y', d => d.y0)
				.attr('height', d => d.y1 - d.y0)
				.attr('width', d => d.x1 - d.x0)
				.attr('fill', d => nodeColor(d.name))
				.append('title')
				.text(d => d.name + '\\n' + d.value + ' edits');

			node.append('text')
				.attr('x', d => d.x0 < width / 2 ? d.x1 + 6 : d.x0 - 6)
				.attr('y', d => (d.y1 + d.y0) / 2)
				.attr('dy', '0.35em')
				.attr('text-anchor', d => d.x0 < width / 2 ? 'start' : 'end')
				.text(d => d.name + ' (' + d.value + ')');
		}

		// Initial draw
		drawSankey(${includeRetries});

		// Update success rate display
		function updateSuccessRate(includeRetries) {
			const rate = includeRetries ? analysisData.successRateWithRetries : analysisData.successRate;
			document.getElementById('success-rate-value').textContent = (rate * 100).toFixed(1) + '%';
		}

		// Handle checkbox change
		document.getElementById('includeRetries').addEventListener('change', (e) => {
			drawSankey(e.target.checked);
			updateSuccessRate(e.target.checked);
		});

		// Redraw on window resize
		let resizeTimer;
		window.addEventListener('resize', () => {
			clearTimeout(resizeTimer);
			resizeTimer = setTimeout(() => {
				const includeRetries = document.getElementById('includeRetries').checked;
				drawSankey(includeRetries);
			}, 250);
		});
	</script>
</body>
</html>`;

	return html;
}

/**
 * Reads the value of a `--<name>=<value>` command line option.
 *
 * @returns the text after the first `=`, or undefined when the option is absent or empty.
 */
function readOption(args: readonly string[], name: string): string | undefined {
	const prefix = `--${name}=`;
	const value = args.find(arg => arg.startsWith(prefix))?.slice(prefix.length);
	return value ? value : undefined;
}

/**
 * Entry point. Options:
 *   --runId=<id>       analyze this run instead of prompting for one
 *   --basePath=<dir>   directory containing the run directories (defaults to DEFAULT_BASE_PATH)
 *
 * Writes `edit-analysis.html` into the analyzed run's directory.
 */
async function main() {
	const args = process.argv.slice(2);
	const basePath = readOption(args, 'basePath') ?? DEFAULT_BASE_PATH;

	let runId = readOption(args, 'runId');

	if (runId !== undefined) {
		console.log(`Using run ID: ${runId}`);
	} else {
		const runs = await listRuns(basePath);
		if (runs.length === 0) {
			console.error('No test runs found in', basePath);
			process.exit(1);
		}
		runId = await promptUserForRun(runs);
		console.log(`Selected run: ${runId}`);
	}

	console.log('\nAnalyzing run...');
	const analysis = await analyzeRun(runId, basePath);

	console.log(`\nFound ${analysis.conversations.length} conversations with edits`);
	console.log(`Total edits: ${analysis.totalEdits}`);
	console.log(`Success rate: ${(analysis.successRate * 100).toFixed(1)}%`);

	const outputPath = path.join(basePath, runId, 'edit-analysis.html');
	const html = generateHTML(analysis, outputPath);

	await fs.writeFile(outputPath, html, 'utf-8');
	console.log(`\n✓ Analysis complete! Generated: ${outputPath}`);
}

main().catch(console.error);