CoCalc -- amlResults.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/test/simulation/workbench/stores/amlResults.ts
13399 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import * as csvParse from 'csv-parse/sync';
7
import type * as vscode from 'vscode';
8
import { OutputAnnotation } from '../../shared/sharedTypes';
9
import { AMLRunKind } from './amlSimulations';
10

11
/**
 * Snapshot of a git repository's state.
 *
 * Copied from src/base/conversation/context/resolvers/gitRepository.ts
 */
interface RepoContext {
	/** Repository name. */
	readonly name: string;

	/** HEAD branch name; `undefined` when none is available. */
	readonly headBranchName: undefined | string;

	/** Upstream branch name; `undefined` when none is available. */
	readonly upstreamBranchName: undefined | string;

	/** Whether a rebase is in progress. */
	readonly isRebasing: boolean;

	/** Remotes of the repository (presumably names or URLs — confirm against the original definition). */
	readonly remotes: Array<string>;
}
19

20
/*
 * Serialized form of the workspace state.
 *
 * Copied from src/base/conversation/context/promptContextModel.ts
 * because workbench tsconfig restricts importing from `promptContextModel` for some reason.
 */
interface ISerializedWorkspaceState {
	readonly workspaceFoldersFilePaths: string[] | undefined;

	// State of the active text editor; `undefined` when there is none.
	readonly activeTextEditor:
		| {
			selections: {
				anchor: vscode.Position;
				active: vscode.Position;
				isReversed: boolean;
			}[];
			documentFilePath: string;
			visibleRanges: {
				start: vscode.Position;
				end: vscode.Position;
			}[];
			languageId: string;
		}
		| undefined;

	// Symbols, each with its file path and start/end positions.
	readonly symbols:
		| {
			name: string;
			kind: vscode.SymbolKind;
			containerName: string;
			filePath: string;
			start: vscode.Position;
			end: vscode.Position;
		}[]
		| undefined;

	readonly notebookDocumentFilePaths: string[] | undefined;

	// Diagnostics of the active file; `severity` is the only optional field.
	readonly activeFileDiagnostics: {
		start: vscode.Position;
		end: vscode.Position;
		message: string;
		severity?: vscode.DiagnosticSeverity;
	}[];

	readonly debugConsoleOutput: string;
	readonly repoContext: RepoContext | undefined;
	readonly terminalBuffer: string;
	readonly terminalSelection: string;
}
47

48
/** Type for `response` field of `<experiment>_scored_predictions.csv` */
export type Response = {
	originalFilePath: string;
	fileBefore: string;
	fileAfter: string;
	logFileContents: string;
	conversationFileContents: string;

	/**
	 * Typed with the `ISerializedWorkspaceState` copy declared in this file
	 * rather than `promptContextModel.ISerializedWorkspaceState`, because
	 * workbench tsconfig restricts importing from `promptContextModel`.
	 */
	workspaceStateFileContents: ISerializedWorkspaceState;
};
62

63
/**
 * A single error reported by the evaluation: where it is, what it says, and
 * which rule/tool produced it.
 */
export type EvaluationError = {
	/** Start of the error span (filled from `start_line_index` / `start_col_index`). */
	startLine: number;
	startColumn: number;

	/** End of the error span (filled from `end_line_index` / `end_col_index`). */
	endLine: number;
	endColumn: number;

	/** Error text. */
	message: string;

	/** Rule that was violated. */
	rule: string;

	/** Tool that reported the error. */
	tool: string;
};
72

73
/** Evaluation of one run of one test case, i.e. one line of the scored predictions file. */
export type TestRunEvaluation = {
	/** Taken from the record's `test_case_id`. */
	caseName: string;

	/** the n-th run for this case? 0-based */
	nId: number;

	/** Taken from the record's `language`. */
	languageId: string;

	isSuccess: boolean;

	// Errors present on only one side of the change (populated for fix runs).
	errorsOnlyInBefore?: EvaluationError[];
	errorsOnlyInAfter?: EvaluationError[];

	/** One annotation per non-`SUCCESS` status code of a failed run. */
	annotations?: OutputAnnotation[];

	// Captured output streams; absent when empty or not a string.
	stdout?: string;
	stderr?: string;

	/** Status message reported for a failed run. */
	evaluatorError?: string;

	// Counters populated for test-generation runs.
	generatedTestCaseCount?: number;
	generatedAssertCount?: number;

	/** Taken from `extra_data_json.diff`. */
	expectedDiff?: string;
};
89

90
/**
 * Parses each line in `<component>_scored_predictions.csv` into a `TestRunEvaluation`.
 *
 * Every entry of `fileContents` is passed to `JSON.parse`, so each entry must be a
 * self-contained JSON document.
 *
 * @param kind Run kind; forwarded to the helpers that interpret `extra_data_json`.
 * @param fileContents One JSON-encoded record per entry.
 * @returns One evaluation per entry, in input order.
 * @throws SyntaxError when an entry is not valid JSON; no recovery is attempted.
 */
function _parseScoredPredictionsCsv(kind: AMLRunKind, fileContents: string[]): TestRunEvaluation[] {
	return fileContents.map(line => {

		const json: any = JSON.parse(line); // may throw but not sure if we should have a way to recover

		let stdout: string | undefined;
		let stderr: string | undefined;
		let errorsOnlyInBefore: EvaluationError[] | undefined;
		let errorsOnlyInAfter: EvaluationError[] | undefined;
		const annotations: OutputAnnotation[] = [];
		let evaluatorError: string | undefined;
		let generatedTestCaseCount: number | undefined;
		let generatedAssertCount: number | undefined;

		const extraDataJson = json.extra_data_json;

		if (extraDataJson) {
			({ errorsOnlyInBefore, errorsOnlyInAfter } = _parseFixEvaluationData(kind, extraDataJson));

			[generatedTestCaseCount, generatedAssertCount] = _parseTestEvaluationData(kind, extraDataJson);

			// Only non-empty strings are kept; anything else is reported as absent.
			stdout = extraDataJson.stdout && typeof extraDataJson.stdout === 'string' ? extraDataJson.stdout : undefined;
			stderr = extraDataJson.stderr && typeof extraDataJson.stderr === 'string' ? extraDataJson.stderr : undefined;
		}

		// A numeric score succeeds only at exactly 1; a non-numeric score is used as-is.
		// Computed once so that the failure details below and the returned flag always agree.
		const isSuccess = typeof json.score === 'number' ? json.score === 1 : json.score;

		if (!isSuccess) {
			const statusCodes: string[] = json.status_codes;

			// Failure details are only collected when status codes were reported.
			if (statusCodes) {
				for (const statusCode of statusCodes) {
					if (statusCode !== 'SUCCESS') {
						annotations.push({ message: `AML eval error: ${statusCode}`, label: statusCode, severity: 'error' } satisfies OutputAnnotation);
					}
				}

				if (json.status_message) {
					evaluatorError = json.status_message;
				}
			}
		}

		return {
			caseName: json.test_case_id,
			nId: parseInt(json.n_id, 10),
			languageId: json.language,
			isSuccess,
			errorsOnlyInBefore,
			errorsOnlyInAfter,
			annotations,
			stdout,
			stderr,
			evaluatorError,
			generatedTestCaseCount,
			generatedAssertCount,
			expectedDiff: extraDataJson?.diff,
		};
	});
}
149

150
/**
 * Extracts the before/after error lists from the extra data of a fix run.
 *
 * @param kind Run kind; anything other than `AMLRunKind.Fix` yields no errors.
 * @param json Extra data of a record; ignored unless it is a non-null object.
 * @returns Both lists converted to `EvaluationError`s and sorted by position.
 *          A list is `undefined` when its source property is missing.
 */
function _parseFixEvaluationData(kind: AMLRunKind, json: unknown): { errorsOnlyInBefore: EvaluationError[] | undefined; errorsOnlyInAfter: EvaluationError[] | undefined } {
	const isFixPayload = kind === AMLRunKind.Fix && typeof json === 'object' && !!json;
	if (!isFixPayload) {
		return { errorsOnlyInBefore: undefined, errorsOnlyInAfter: undefined };
	}

	const payload = json as any;
	const errorsOnlyInAfter: EvaluationError[] | undefined = payload.errors_only_in_after
		?.map(_toEvaluationError)
		.sort(_evaluationErrorComparator);
	const errorsOnlyInBefore: EvaluationError[] | undefined = payload.errors_only_in_before
		?.map(_toEvaluationError)
		.sort(_evaluationErrorComparator);

	return { errorsOnlyInBefore, errorsOnlyInAfter };
}
161

162
/**
 * Reads the generated test-case and assert counters from the extra data of a
 * test-generation run.
 *
 * @param kind Run kind; anything other than `AMLRunKind.TestGen` yields no counters.
 * @param json Extra data of a record; ignored unless it is a non-null object.
 * @returns `[generatedTestCaseCount, generatedAssertCount]`; an element is
 *          `undefined` when its property is not present on `json`.
 */
function _parseTestEvaluationData(kind: AMLRunKind, json: unknown): [number | undefined, number | undefined] {
	if (kind !== AMLRunKind.TestGen || typeof json !== 'object' || !json) {
		return [undefined, undefined];
	}

	// The values are taken on trust: they are asserted to be numbers, not validated.
	const testCaseCount = 'generated_test_case_count' in json
		? json.generated_test_case_count as number
		: undefined;
	const assertCount = 'generated_assert_count' in json
		? json.generated_assert_count as number
		: undefined;

	return [testCaseCount, assertCount];
}
177

178
/**
 * Converts a raw error record (snake_case `*_index` position fields) into an
 * `EvaluationError`. Fields are copied as-is, without validation.
 */
function _toEvaluationError(error: any): EvaluationError {
	const {
		message,
		rule,
		tool,
		start_line_index: startLine,
		start_col_index: startColumn,
		end_line_index: endLine,
		end_col_index: endColumn,
	} = error;

	return { message, rule, tool, startLine, startColumn, endLine, endColumn };
}
190

191
/**
 * Orders evaluation errors by position: start line, then start column, then
 * end line, then end column.
 *
 * @returns The difference of the first position field that differs, or `0`
 *          when all four fields are equal.
 */
function _evaluationErrorComparator(error1: EvaluationError, error2: EvaluationError) {
	const positionKeys = ['startLine', 'startColumn', 'endLine', 'endColumn'] as const;

	for (const key of positionKeys) {
		if (error1[key] !== error2[key]) {
			return error1[key] - error2[key];
		}
	}
	return 0;
}
206

207
/** Evaluation of all runs of one test case, aggregated from `TestRunEvaluation`s. */
export type TestRunsEvaluation = {
	caseName: string;

	/** Language id of the aggregated runs. */
	activeEditorLanguageId?: string;

	/** Success flag of every run of the case, in input order. */
	isEachTestRunSuccess: boolean[];

	// Errors present on only one side of the change.
	errorsOnlyInBefore?: EvaluationError[];
	errorsOnlyInAfter?: EvaluationError[];

	annotations?: OutputAnnotation[];

	// Captured output streams.
	stdout?: string;
	stderr?: string;

	evaluatorError?: string;

	// Counters for generated tests.
	generatedTestCaseCount?: number;
	generatedAssertCount?: number;

	expectedDiff?: string;
};
221

222
/**
 * Parses lines in `<component>_scored_predictions.csv` and aggregates them into
 * a format that is easier to use for our purposes.
 *
 * Consecutive runs sharing a `caseName` are folded into one entry. The entry
 * lists the success flag of every run in the group; all other details are
 * taken from the first run of the group.
 *
 * @param kind Run kind, forwarded to the per-line parser.
 * @param fileContents One record per entry.
 * @returns One aggregated evaluation per group of consecutive same-case runs.
 */
export function parseScoredPredictionsCsv(kind: AMLRunKind, fileContents: string[]): TestRunsEvaluation[] {

	const runs = _parseScoredPredictionsCsv(kind, fileContents);
	const aggregated: TestRunsEvaluation[] = [];

	let groupStart = 0;
	while (groupStart < runs.length) {
		const firstRun = runs[groupStart];

		// Advance to the first run that belongs to a different case.
		let groupEnd = groupStart;
		while (groupEnd < runs.length && runs[groupEnd].caseName === firstRun.caseName) {
			groupEnd++;
		}

		aggregated.push({
			caseName: firstRun.caseName,
			activeEditorLanguageId: firstRun.languageId,
			isEachTestRunSuccess: runs.slice(groupStart, groupEnd).map(run => run.isSuccess),
			errorsOnlyInBefore: firstRun.errorsOnlyInBefore,
			errorsOnlyInAfter: firstRun.errorsOnlyInAfter,
			annotations: firstRun.annotations,
			stdout: firstRun.stdout,
			stderr: firstRun.stderr,
			evaluatorError: firstRun.evaluatorError,
			generatedTestCaseCount: firstRun.generatedTestCaseCount,
			generatedAssertCount: firstRun.generatedAssertCount,
			expectedDiff: firstRun.expectedDiff,
		});

		groupStart = groupEnd;
	}

	return aggregated;
}
255

256
/**
 * One row of a score card: summary statistics of a single metric.
 *
 * NOTE(review): the cast callback in `parseScoreCard` produces formatted
 * strings for `mean` (a percentage), `median` and `stdErr`, so values from
 * that parser do not match the `number` declarations here at runtime.
 */
export type ScoreCard = {
	/** Name of the metric. */
	metric: string;

	mean: number;
	median: number;
	stdErr: number;

	/** `[lower, upper]` bounds. */
	confidenceInterval: [number, number];

	count: number;
};
264

265
/**
 * Parses a score card CSV and returns its first record.
 *
 * Columns are read positionally as `metric`, `mean`, `median`, `stdErr`,
 * `confidenceInterval` and `count`. Parsing starts at line 2 (`fromLine: 2`),
 * which presumably skips a header row.
 *
 * Cell conversion:
 * - `metric` is kept verbatim
 * - `mean` becomes a percentage string with two decimals, e.g. `"12.34%"`
 * - `confidenceInterval` loses its first and last character and is split on `", "` into two floats
 * - `count` is parsed as an integer
 * - every other column becomes a two-decimal string
 *
 * @param fileContents Raw CSV text.
 * @returns The first record; `undefined` at runtime when there is none.
 */
export function parseScoreCard(fileContents: string): ScoreCard {
	const castCell = (value: string, context: csvParse.CastingContext) => {
		const column = context.column;

		if (column === 'metric') {
			return value;
		}
		if (column === 'mean') {
			return `${(parseFloat(value) * 100).toFixed(2)}%`;
		}
		if (column === 'confidenceInterval') {
			// Drop the surrounding parentheses before splitting the two bounds.
			const unparenthesized = value.substring(1, value.length - 1);
			const bounds = unparenthesized.split(', ').map(parseFloat);
			return [bounds[0], bounds[1]];
		}
		if (column === 'count') {
			return parseInt(value);
		}
		return parseFloat(value).toFixed(2);
	};

	const scoreCardRows: ScoreCard[] = csvParse.parse(fileContents, {
		delimiter: ',',
		columns: ['metric', 'mean', 'median', 'stdErr', 'confidenceInterval', 'count'],
		cast: castCell,
		fromLine: 2,
	});

	return scoreCardRows[0];
}
293

294
/** Per-language summary of a score card. */
export type ScoreCardByLanguage = {
	language: string;

	/** Number of test cases for the language. */
	testCasesCount: number;

	// Number of scored and unscored entries.
	scoredCount: number;
	unscoredCount: number;

	meanScore: number;
};
301

302
/**
 * Parses the per-language score card, a JSON array of entries keyed
 * `Language`, `nTestCases`, `nScored`, `nUnscored` and `MeanScore`.
 *
 * @param fileContents JSON text of the array.
 * @returns One `ScoreCardByLanguage` per entry, in input order; values are copied without validation.
 * @throws SyntaxError when `fileContents` is not valid JSON.
 */
export function parseScoreCardByLanguage(fileContents: string): ScoreCardByLanguage[] {
	const entries = JSON.parse(fileContents);

	return entries.map(({ Language, nTestCases, nScored, nUnscored, MeanScore }: any) => ({
		language: Language,
		testCasesCount: nTestCases,
		scoredCount: nScored,
		unscoredCount: nUnscored,
		meanScore: MeanScore,
	}));
}
311

312
Product

Resources

Company