Path: blob/main/extensions/copilot/test/simulation/workbench/stores/amlResults.ts
13399 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import * as csvParse from 'csv-parse/sync';
import type * as vscode from 'vscode';
import { OutputAnnotation } from '../../shared/sharedTypes';
import { AMLRunKind } from './amlSimulations';

/** Copied from src/base/conversation/context/resolvers/gitRepository.ts */
interface RepoContext {
	readonly name: string;
	readonly headBranchName: string | undefined;
	readonly upstreamBranchName: string | undefined;
	readonly isRebasing: boolean;
	readonly remotes: string[];
}

/*
 * Copied from src/base/conversation/context/promptContextModel.ts
 * because workbench tsconfig restricts importing from `promptContextModel` for some reason.
 */
interface ISerializedWorkspaceState {
	readonly workspaceFoldersFilePaths: string[] | undefined;
	readonly activeTextEditor: {
		selections: { anchor: vscode.Position; active: vscode.Position; isReversed: boolean }[];
		documentFilePath: string;
		visibleRanges: { start: vscode.Position; end: vscode.Position }[];
		languageId: string;
	} | undefined;
	readonly symbols: {
		name: string;
		kind: vscode.SymbolKind;
		containerName: string;
		filePath: string;
		start: vscode.Position;
		end: vscode.Position;
	}[] | undefined;
	readonly notebookDocumentFilePaths: string[] | undefined;
	readonly activeFileDiagnostics: { start: vscode.Position; end: vscode.Position; message: string; severity?: vscode.DiagnosticSeverity }[];
	readonly debugConsoleOutput: string;
	readonly repoContext: RepoContext | undefined;
	readonly terminalBuffer: string;
	readonly terminalSelection: string;
}

/** Type for `response` field of `<experiment>_scored_predictions.csv` */
export type Response = {
	originalFilePath: string;
	fileBefore: string;
	fileAfter: string;
	logFileContents: string;
	conversationFileContents: string;

	/**
	 * Typed with the local `ISerializedWorkspaceState` copy declared in this file,
	 * because workbench tsconfig restricts importing the original from `promptContextModel`.
	 */
	workspaceStateFileContents: ISerializedWorkspaceState;
};

/** A single evaluation error, converted from the snake_case fields of the evaluator output. */
export type EvaluationError = {
	startLine: number;
	startColumn: number;
	endLine: number;
	endColumn: number;
	message: string;
	rule: string;
	tool: string;
};

/** The evaluation of one run of one test case. */
export type TestRunEvaluation = {
	caseName: string;
	/** the n-th run for this case? 0-based */
	nId: number;
	languageId: string;
	isSuccess: boolean;
	errorsOnlyInBefore?: EvaluationError[];
	errorsOnlyInAfter?: EvaluationError[];
	annotations?: OutputAnnotation[];
	stdout?: string;
	stderr?: string;
	evaluatorError?: string;
	generatedTestCaseCount?: number;
	generatedAssertCount?: number;
	expectedDiff?: string;
};

/**
 * Parses each line in `<component>_scored_predictions.csv` into a `TestRunEvaluation`.
 *
 * @param kind Run kind; decides which kind-specific fields of `extra_data_json` are read.
 * @param fileContents One JSON document per array element.
 * @returns One evaluation per input line, in input order.
 * @throws SyntaxError (from `JSON.parse`) when a line is not valid JSON.
 */
function _parseScoredPredictionsCsv(kind: AMLRunKind, fileContents: string[]): TestRunEvaluation[] {
	return fileContents.map(line => {

		const json: any = JSON.parse(line); // may throw but not sure if we should have a way to recover

		let stdout: string | undefined;
		let stderr: string | undefined;
		let errorsOnlyInBefore: EvaluationError[] | undefined;
		let errorsOnlyInAfter: EvaluationError[] | undefined;
		const annotations: OutputAnnotation[] = [];
		let evaluatorError: string | undefined;
		let generatedTestCaseCount: number | undefined;
		let generatedAssertCount: number | undefined;

		const extraDataJson = json.extra_data_json;

		if (extraDataJson) {
			({ errorsOnlyInBefore, errorsOnlyInAfter } = _parseFixEvaluationData(kind, extraDataJson));

			[generatedTestCaseCount, generatedAssertCount] = _parseTestEvaluationData(kind, extraDataJson);

			// Only non-empty strings are kept; anything else is reported as undefined.
			stdout = extraDataJson.stdout && typeof extraDataJson.stdout === 'string' ? extraDataJson.stdout : undefined;
			stderr = extraDataJson.stderr && typeof extraDataJson.stderr === 'string' ? extraDataJson.stderr : undefined;
		}

		if (json.score !== 1) {
			const statusCodes: string[] = json.status_codes;

			if (statusCodes) {
				// Every non-SUCCESS status code becomes an error annotation.
				for (const statusCode of statusCodes) {
					if (statusCode !== 'SUCCESS') {
						annotations.push({ message: `AML eval error: ${statusCode}`, label: statusCode, severity: 'error' } satisfies OutputAnnotation);
					}
				}

				if (json.status_message) {
					evaluatorError = evaluatorError ? `${evaluatorError}\n${json.status_message}` : json.status_message;
				}
			}
		}

		return {
			caseName: json.test_case_id,
			nId: parseInt(json.n_id, 10),
			languageId: json.language,
			// A numeric score succeeds only when it equals 1. Any other value is coerced
			// so that `isSuccess` is always a real boolean, as its type declares.
			isSuccess: typeof json.score === 'number' ? json.score === 1 : Boolean(json.score),
			errorsOnlyInBefore,
			errorsOnlyInAfter,
			annotations,
			stdout,
			stderr,
			evaluatorError,
			generatedTestCaseCount: generatedTestCaseCount,
			generatedAssertCount: generatedAssertCount,
			expectedDiff: extraDataJson?.diff,
		};
	});
}

/**
 * Reads the sorted `errors_only_in_before` / `errors_only_in_after` lists from the extra data.
 *
 * @returns Both lists; each is `undefined` unless `kind` is `AMLRunKind.Fix`, `json` is a
 * non-null object, and the corresponding field is present.
 */
function _parseFixEvaluationData(kind: AMLRunKind, json: unknown) {
	let errorsOnlyInBefore: EvaluationError[] | undefined;
	let errorsOnlyInAfter: EvaluationError[] | undefined;

	if (kind === AMLRunKind.Fix && typeof json === 'object' && json) {
		// `map` produces a fresh array, so the in-place `sort` does not touch the parsed input.
		errorsOnlyInAfter = (json as any).errors_only_in_after?.map(_toEvaluationError).sort(_evaluationErrorComparator);
		errorsOnlyInBefore = (json as any).errors_only_in_before?.map(_toEvaluationError).sort(_evaluationErrorComparator);
	}

	return { errorsOnlyInBefore, errorsOnlyInAfter };
}

/**
 * Reads the generated test-case and assert counts from the extra data.
 *
 * @returns `[generatedTestCaseCount, generatedAssertCount]`; each entry is `undefined` unless
 * `kind` is `AMLRunKind.TestGen`, `json` is a non-null object, and the field is present.
 */
function _parseTestEvaluationData(kind: AMLRunKind, json: unknown): [number | undefined, number | undefined] {
	let generatedTestCaseCount: number | undefined = undefined;
	let generatedAssertCount: number | undefined = undefined;

	if (kind === AMLRunKind.TestGen && typeof json === 'object' && json) {
		if ('generated_test_case_count' in json) {
			generatedTestCaseCount = json.generated_test_case_count as number;
		}
		if ('generated_assert_count' in json) {
			generatedAssertCount = json.generated_assert_count as number;
		}
	}

	return [generatedTestCaseCount, generatedAssertCount];
}

/** Converts one raw error entry (snake_case `*_index` fields) into an `EvaluationError`. */
function _toEvaluationError(error: any): EvaluationError {

	return {
		message: error.message,
		rule: error.rule,
		tool: error.tool,
		startLine: error.start_line_index,
		startColumn: error.start_col_index,
		endLine: error.end_line_index,
		endColumn: error.end_col_index
	};
}

/** Orders errors by start line, then start column, then end line, then end column. */
function _evaluationErrorComparator(error1: EvaluationError, error2: EvaluationError) {
	if (error1.startLine !== error2.startLine) {
		return error1.startLine - error2.startLine;
	}
	if (error1.startColumn !== error2.startColumn) {
		return error1.startColumn - error2.startColumn;
	}
	if (error1.endLine !== error2.endLine) {
		return error1.endLine - error2.endLine;
	}
	if (error1.endColumn !== error2.endColumn) {
		return error1.endColumn - error2.endColumn;
	}
	return 0;
}

/** The aggregated evaluation of all consecutive runs of one test case. */
export type TestRunsEvaluation = {
	caseName: string;
	activeEditorLanguageId?: string;
	isEachTestRunSuccess: boolean[];
	errorsOnlyInBefore?: EvaluationError[];
	errorsOnlyInAfter?: EvaluationError[];
	annotations?: OutputAnnotation[];
	stdout?: string;
	stderr?: string;
	evaluatorError?: string;
	generatedTestCaseCount?: number;
	generatedAssertCount?: number;
	expectedDiff?: string;
};

/**
 * Parses lines in `<component>_scored_predictions.csv` and aggregates them into
 * a format that is easier to use for our purposes.
 *
 * Consecutive entries with the same case name are merged into one result. The per-run
 * success flags are collected in order; every other field is taken from the first
 * entry of the group.
 *
 * @param kind Run kind, forwarded to the per-line parser.
 * @param fileContents One JSON document per array element.
 * @throws SyntaxError (from `JSON.parse`) when a line is not valid JSON.
 */
export function parseScoredPredictionsCsv(kind: AMLRunKind, fileContents: string[]): TestRunsEvaluation[] {

	const testRunEvals = _parseScoredPredictionsCsv(kind, fileContents);

	const testRunsEvaluation: TestRunsEvaluation[] = [];

	let ix = 0;
	while (ix < testRunEvals.length) {
		const { caseName, languageId, errorsOnlyInBefore, errorsOnlyInAfter, annotations, stdout, stderr, evaluatorError, generatedTestCaseCount, generatedAssertCount, expectedDiff } = testRunEvals[ix];
		const isEachTestRunSuccess: boolean[] = [];
		// Consume the whole run of adjacent entries that belong to this case.
		while (ix < testRunEvals.length && testRunEvals[ix].caseName === caseName) {
			isEachTestRunSuccess.push(testRunEvals[ix].isSuccess);
			ix++;
		}
		testRunsEvaluation.push({
			caseName,
			activeEditorLanguageId: languageId,
			isEachTestRunSuccess,
			errorsOnlyInBefore,
			errorsOnlyInAfter,
			annotations,
			stdout,
			stderr,
			evaluatorError,
			generatedTestCaseCount: generatedTestCaseCount,
			generatedAssertCount: generatedAssertCount,
			expectedDiff,
		});
	}
	return testRunsEvaluation;
}

/**
 * One row of the score card.
 *
 * NOTE(review): `parseScoreCard` casts `mean` to a percentage string and `median` / `stdErr`
 * to fixed-point strings, so the runtime values do not match the `number` types declared
 * here. Confirm what consumers expect before changing either side.
 */
export type ScoreCard = {
	metric: string;
	mean: number;
	median: number;
	stdErr: number;
	confidenceInterval: [number, number];
	count: number;
};

/**
 * Parses the score card CSV and returns its first data row.
 *
 * The first line of the file is skipped (`fromLine: 2`) and the columns are named
 * positionally as metric, mean, median, stdErr, confidenceInterval, count.
 *
 * @param fileContents Raw CSV text.
 * @returns The first parsed row; `undefined` at runtime when there is no row after the first line.
 */
export function parseScoreCard(fileContents: string): ScoreCard {
	const scoreCardRows: ScoreCard[] = csvParse.parse(
		fileContents,
		{
			delimiter: ',',
			columns: ['metric', 'mean', 'median', 'stdErr', 'confidenceInterval', 'count'],
			cast: (value: string, context: csvParse.CastingContext) => {
				switch (context.column) {
					case 'metric':
						return value;
					case 'mean':
						return `${(parseFloat(value) * 100).toFixed(2)}%`;
					case 'confidenceInterval': {
						// Drop the surrounding brackets, then split on the comma alone:
						// `parseFloat` ignores leading whitespace, so both "a, b" and "a,b" parse.
						const unparenthesized = value.substring(1, value.length - 1);
						const [lower, upper] = unparenthesized.split(',').map(part => parseFloat(part));
						return [lower, upper];
					}
					case 'count':
						return parseInt(value, 10);
					default:
						// Remaining columns: `median` and `stdErr`.
						return parseFloat(value).toFixed(2);
				}
			},
			fromLine: 2,
		}
	);
	return scoreCardRows[0];
}

/** Per-language summary of a run. */
export type ScoreCardByLanguage = {
	language: string;
	testCasesCount: number;
	scoredCount: number;
	unscoredCount: number;
	meanScore: number;
};

/**
 * Parses the per-language score card, mapping each entry's `Language`, `nTestCases`,
 * `nScored`, `nUnscored` and `MeanScore` fields onto `ScoreCardByLanguage`.
 *
 * @param fileContents JSON text; the `.map` call requires it to parse to an array.
 * @throws SyntaxError (from `JSON.parse`) when the text is not valid JSON.
 */
export function parseScoreCardByLanguage(fileContents: string): ScoreCardByLanguage[] {
	return JSON.parse(fileContents).map((entry: any) => ({
		language: entry.Language,
		testCasesCount: entry.nTestCases,
		scoredCount: entry.nScored,
		unscoredCount: entry.nUnscored,
		meanScore: entry.MeanScore
	}));
}