Path: blob/main/extensions/copilot/src/platform/notebook/common/alternativeContentEditGenerator.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import type { CancellationToken, NotebookCell, NotebookDocument } from 'vscode';
import { isJupyterNotebookUri } from '../../../util/common/notebooks';
import { createServiceIdentifier } from '../../../util/common/services';
import { isUri } from '../../../util/common/types';
import { AsyncIterableObject, AsyncIterableSource, DeferredPromise } from '../../../util/vs/base/common/async';
import { StringSHA1 } from '../../../util/vs/base/common/hash';
import { Constants } from '../../../util/vs/base/common/uint';
import { EndOfLine, NotebookCellData, NotebookCellKind, NotebookEdit, NotebookRange, Range, TextEdit, Uri } from '../../../vscodeTypes';
import { IDiffService } from '../../diff/common/diffService';
import { ILogService } from '../../log/common/logService';
import { ITelemetryService } from '../../telemetry/common/telemetry';
import { AlternativeContentFormat, IAlternativeNotebookContentService } from './alternativeContent';
import { lineMightHaveCellMarker } from './alternativeContentProvider.text';
import { EOL, getCellId, getCellIdMap, LineOfText } from './helpers';
import { computeDiff } from './notebookDiff';

/**
 * Optional metadata attached to the `notebook.editGeneration` telemetry event.
 */
export type NotebookEditGenerationTelemtryOptions = {
	/** Model used for the response; may still be resolving when edits are generated. */
	model: Promise<string> | string | undefined;
	/** Id of the current request turn. */
	requestId: string | undefined;
	/** The feature that asked for the edits to be generated. */
	source: NotebookEditGenrationSource;
};

/**
 * Identifies where a notebook edit generation request originated from.
 */
export enum NotebookEditGenrationSource {
	codeMapperEditNotebook = 'codeMapperEditNotebook',
	codeMapperEmptyNotebook = 'codeMapperEmptyNotebook',
	codeMapperFastApply = 'codeMapperFastApply',
	createFile = 'createFile',
	stringReplace = 'stringReplace',
	applyPatch = 'applyPatch',
	newNotebookIntent = 'newNotebookIntent',
}

export const IAlternativeNotebookContentEditGenerator = createServiceIdentifier<IAlternativeNotebookContentEditGenerator>('IAlternativeNotebookContentEditGenerator');

/**
 * Service that turns the alternative (json/xml/text) representation of a notebook
 * into a stream of notebook edits and per-cell text edits.
 */
export interface IAlternativeNotebookContentEditGenerator {
	readonly _serviceBrand: undefined;
	/**
	 * @param notebookOrUri The notebook to edit, or just its Uri when the notebook is not available.
	 * @param lines The alternative content, either as a whole string or as a stream of lines.
	 * @param telemetryOptions Optional metadata reported with the telemetry event.
	 * @param token Cancellation token.
	 * @returns A stream of notebook edits (`NotebookEdit`) and cell text edits (`[cellUri, TextEdit[]]`).
	 */
	generateNotebookEdits(notebookOrUri: NotebookDocument | Uri, lines: AsyncIterable<LineOfText> | string, telemetryOptions: NotebookEditGenerationTelemtryOptions | undefined, token: CancellationToken): AsyncIterable<NotebookEdit | [Uri, TextEdit[]]>;
}

export class AlternativeNotebookContentEditGenerator implements IAlternativeNotebookContentEditGenerator {
	declare readonly _serviceBrand: undefined;
	constructor(
		@IAlternativeNotebookContentService private readonly alternativeContentService: IAlternativeNotebookContentService,
		@IDiffService private readonly diffService: IDiffService,
		@ILogService private readonly logger: ILogService,
		@ITelemetryService private readonly telemetryService: ITelemetryService,
	) {
	}

	/**
	 * Detects the alternative content format from the first non-empty line.
	 */
	private getFormat(firstLine: string): AlternativeContentFormat {
		// If the source starts with `{`, then it is a JSON string,
		// if it starts with `<`, then it is an XML string, else text.
		// Trim, as we want to ensure we remove any leading/trailing whitespace (e.g. it is possible there's empty space between the fence and the content)
		const firstChar = firstLine.trim().substring(0, 1);
		const format = firstChar === '{' ? 'json' : firstChar === '<' ? 'xml' : 'text';
		return format;
	}

	/**
	 * Given a NotebookDocument or Uri, and a cell kind, return the EOL for the new cell.
	 * If only a Uri is given, or the notebook has no cell of the given kind, then return the default EOL.
	 * Else default to the EOL of the first cell of the given kind.
	 * This way we have a consistent EOL for new cells (matching existing cells).
	 */
	private getEOLForNewCell(notebookOrUri: NotebookDocument | Uri, cellKind: NotebookCellKind): string | undefined {
		const eolInExistingCodeCell = isUri(notebookOrUri) ? undefined : (notebookOrUri.getCells().find(c => c.kind === cellKind)?.document.eol ?? undefined);
		return eolInExistingCodeCell ? eolInExistingCodeCell === EndOfLine.LF ? '\n' : '\r\n' : EOL;
	}

	/**
	 * Given a stream of lines for the alternative content, generate the corresponding edits to apply to the notebook document.
	 * We accept a NotebookDocument or a Uri.
	 * This is because it is possible the Notebook may not have been created/loaded as of yet.
	 * I.e. for new Notebooks, we can emit the Insert Cell Edits without the notebook being created.
	 *
	 * Once the edits have been streamed, a `notebook.editGeneration` telemetry event is sent (best effort).
	 *
	 * @param notebookOrUri The notebook to edit, or just its Uri when the notebook is not available.
	 * @param lines The alternative content, either as a whole string or as a stream of lines.
	 * @param telemetryOptions Optional metadata reported with the telemetry event.
	 * @param token Cancellation token; no further edits are generated once cancellation is requested.
	 */
	public async *generateNotebookEdits(notebookOrUri: NotebookDocument | Uri, lines: AsyncIterable<LineOfText> | string, telemetryOptions: NotebookEditGenerationTelemtryOptions | undefined, token: CancellationToken): AsyncIterable<NotebookEdit | [Uri, TextEdit[]]> {
		lines = typeof lines === 'string' ? textToAsyncIterableLines(lines) : lines;
		const firstNonEmptyLinePromise = new DeferredPromise<LineOfText>();
		lines = readFirstNonEmptyLineAndKeepStreaming(lines, firstNonEmptyLinePromise);
		const firstNonEmptyLine = (await firstNonEmptyLinePromise.p).value;
		const format = this.getFormat(firstNonEmptyLine);

		// Sometimes llm hallucinates with jupytext format, and doesn't send the cell markers.
		// Instead just sends plain python code.
		// In such cases, if no new cells were emitted, then emit a new cell with the contents of the entire plain python code.
		const linesCollected: string[] = [];
		lines = collectWhileStreaming(lines, linesCollected);
		const isEmptyNotebook = isUri(notebookOrUri) || notebookOrUri.cellCount === 0;

		let notebookEditEmitted = false;
		let cellTextEditEmitted = false;
		for await (const edit of this.generateNotebookEditsImpl(notebookOrUri, lines, format, token)) {
			if (Array.isArray(edit)) {
				cellTextEditEmitted = true;
			} else {
				notebookEditEmitted = true;
			}
			yield edit;
		}

		// This fallback applies to both Uri and NotebookDocument inputs.
		// It is skipped on cancellation, as the collected lines are then only a partial view of the content.
		if (!token.isCancellationRequested && !notebookEditEmitted && format === 'text' && linesCollected.length && !lineMightHaveCellMarker(firstNonEmptyLine)) {
			const uri = isUri(notebookOrUri) ? notebookOrUri : notebookOrUri.uri;
			if (isJupyterNotebookUri(uri)) {
				const eolForNewCell = this.getEOLForNewCell(notebookOrUri, NotebookCellKind.Code);
				const cellData = new NotebookCellData(NotebookCellKind.Code, linesCollected.join(eolForNewCell), 'python');
				yield NotebookEdit.insertCells(0, [cellData]);
				this.logger.info(`No new cells were emitted for ${uri.toString()}. Emitting a new cell with the contents of the code.`);
			} else {
				this.logger.warn(`No new cells were emitted for ${uri.toString()}`);
			}
		}

		// Telemetry is fire-and-forget: never block the consumer on it and never let a failure surface as an unhandled rejection.
		void (async () => {
			try {
				const model = await Promise.resolve(telemetryOptions?.model).catch(() => undefined);
				/* __GDPR__
					"notebook.editGeneration" : {
						"owner": "donjayamanne",
						"comment": "Metadata about the code mapper request",
						"requestId": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The id of the current request turn." },
						"requestSource": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "The source from where the request was made" },
						"model": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Model selection for the response" },
						"inputFormat": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Input format for the notebook source (xml, json, text)" },
						"isEmptyNotebook": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether the notebook is empty", "isMeasurement": true },
						"isNotebookOrUri": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether we're given a notebook or just a uri (1 = Notebook, 0 = Uri)", "isMeasurement": true },
						"isJupyterNotebookUri": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether we're given a Jupyter notebook or just a uri (1 = Jupyter Notebook, 0 = Other)", "isMeasurement": true },
						"isEditEmitted": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether a Notebook edit was emitted (insert or delete cell) (1 = Yes, 0 = No)", "isMeasurement": true },
						"isCellTextEditEmitted": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether an edit was emitted for a cell (1 = Yes, 0 = No)", "isMeasurement": true },
						"sourceLength": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of lines in the source code from which we're to generate edits", "isMeasurement": true }
					}
				*/
				this.telemetryService.sendMSFTTelemetryEvent('notebook.editGeneration', {
					requestId: telemetryOptions?.requestId,
					requestSource: telemetryOptions?.source,
					model,
					inputFormat: format
				}, {
					isEmptyNotebook: isEmptyNotebook ? 1 : 0,
					isNotebookOrUri: isUri(notebookOrUri) ? 0 : 1,
					isJupyterNotebookUri: isJupyterNotebookUri(isUri(notebookOrUri) ? notebookOrUri : notebookOrUri.uri) ? 1 : 0,
					isEditEmitted: notebookEditEmitted ? 1 : 0,
					isCellTextEditEmitted: cellTextEditEmitted ? 1 : 0,
					sourceLength: linesCollected.length
				});
			} catch (err) {
				this.logger.warn(`Failed to send notebook.editGeneration telemetry: ${err}`);
			}
		})();
	}

	/**
	 * Parses the alternative content with the provider for the given format and streams the edits:
	 * - text edits for cells that map to an existing cell (matched by cell id),
	 * - insert edits for cells that do not map to an existing cell,
	 * - once the whole content has been parsed (and only when a non-empty notebook is available),
	 *   delete/insert edits so that the notebook ends up with the cells found in the content.
	 *
	 * Generation stops as soon as cancellation is requested; in particular no cells are deleted,
	 * as cells that were not streamed yet would otherwise be treated as removed.
	 *
	 * @param notebookOrUri The notebook to edit, or just its Uri when the notebook is not available.
	 * @param lines The alternative content as a stream of lines.
	 * @param format Format of the alternative content.
	 * @param token Cancellation token.
	 */
	public async *generateNotebookEditsImpl(notebookOrUri: NotebookDocument | Uri, lines: AsyncIterable<LineOfText>, format: AlternativeContentFormat, token: CancellationToken): AsyncIterable<NotebookEdit | [Uri, TextEdit[]]> {
		const provider = this.alternativeContentService.create(format);
		const isEmptyNotebook = isUri(notebookOrUri) || notebookOrUri.cellCount === 0;
		const isNotebookAvailable = !isUri(notebookOrUri);
		const cellIdMap = isNotebookAvailable ? getCellIdMap(notebookOrUri) : new Map<string, NotebookCell>();

		// This tracks the order and content of the cells as they are expected to be in the notebook.
		type ExpectedCellInfo = { index: number; cell?: NotebookCell; lines: string[]; language: string };

		// State of the cell currently being streamed.
		const cellInfo: { index: number; language: string; cell?: NotebookCell; lines: string[]; insertEdit?: NotebookEdit; ended: boolean } = {
			index: -1,
			lines: [],
			language: 'markdown',
			ended: false
		};

		// New cells have no id yet, derive a stand-in id from the index of the cell.
		function getCellIdOfNewCell(cell: ExpectedCellInfo): string {
			const hash = new StringSHA1();
			hash.update(cell.index.toString());
			return hash.digest().substring(0, 8);
		}

		const expectedCells: ExpectedCellInfo[] = [];
		const original: { id: string; uri?: Uri }[] = isUri(notebookOrUri) ? [] : notebookOrUri.getCells().map(cell => ({ id: getCellId(cell), uri: cell.document.uri }));
		const allLines: string[] = [];
		lines = collectWhileStreaming(lines, allLines);
		let editsEmitted = false;
		for await (const line of provider.parseAlternateContent(notebookOrUri, lines, token)) {
			if (token.isCancellationRequested) {
				// Do not fall through to the delete/diff phase below:
				// `expectedCells` is incomplete and cells not streamed yet would be deleted.
				return;
			}
			if (line.type === 'start') {
				const expectedCell: ExpectedCellInfo = {
					index: line.index,
					language: line.language || 'markdown',
					lines: [],
					cell: line.id ? cellIdMap.get(line.id) : undefined
				};
				expectedCells.push(expectedCell);
				cellInfo.ended = false;
				cellInfo.insertEdit = undefined;
				cellInfo.index = expectedCell.index;
				// Shared array: lines pushed for the current cell are also recorded on the expected cell.
				cellInfo.lines = expectedCell.lines;
				cellInfo.language = expectedCell.language;
				cellInfo.cell = expectedCell.cell;
			} else if (line.type === 'end') {
				cellInfo.ended = true;
				const doc = cellInfo.cell?.document;
				if (!cellInfo.insertEdit && !cellInfo.cell && !cellInfo.lines.length) {
					// This is a case where we have an empty cell.
					// We do not get the line at all, but we only have a start and end,
					// Meaning it is a cell, and it is well structured, but it is empty.
					const cellData = new NotebookCellData(cellInfo.language === 'markdown' ? NotebookCellKind.Markup : NotebookCellKind.Code, '', cellInfo.language);
					const insertEdit = NotebookEdit.insertCells(cellInfo.index, [cellData]);
					yield insertEdit;
					editsEmitted = true;
					original.splice(cellInfo.index, 0, { id: getCellIdOfNewCell(cellInfo) });
				} else if (cellInfo.insertEdit && !cellInfo.cell) {
					// Possible we got a cell from LLM that doesn't have an id, but matches the content of an existing cell.
					// This can happen as follows:
					// 1. User asks LLM to insert a cell
					// 2. LLM returns an edit request to insert the cell without the cell id
					// 3. We insert the cell
					// 4. User asks for some other changes,
					// 5. LLM uses history and sees the cell in history that doesn't have an id
					// 6. LLM returns this same cell again along with other cells (new/changes, etc)
					// 7. Somehow SD endpoint cannot figure out this is the same cell, and SD returns this cell but without the id
					// 8. Now we see this cell without an id, we insert it and we delete the old cell that was in this place.
					// Solution: If the cell being inserted is the same as the cell that is already in the notebook in the same position, then don't insert it.
					const existingCell = (!isEmptyNotebook && isNotebookAvailable && cellInfo.index < notebookOrUri.cellCount) ? notebookOrUri.cellAt(cellInfo.index) : undefined;
					if (existingCell && existingCell.document.getText() === cellInfo.insertEdit.newCells[0].value) {
						// Emit the edits for this cell.
						// & do not insert this cell.
						expectedCells[expectedCells.length - 1].cell = existingCell;

						// Re-emit the edits for all the lines of this existing cell.
						const existingDoc = existingCell.document;
						for (let i = 0; i < existingDoc.lineCount; i++) {
							const existingLine = existingDoc.lineAt(i);
							yield [existingDoc.uri, [new TextEdit(new Range(i, 0, i, Constants.MAX_SAFE_SMALL_INTEGER), existingLine.text)]];
							editsEmitted = true;
						}
					} else {
						yield cellInfo.insertEdit;
						editsEmitted = true;
						original.splice(cellInfo.index, 0, { id: getCellIdOfNewCell(cellInfo) });
					}
				} else if (cellInfo.lines.length && doc && cellInfo.lines.length < doc.lineCount) {
					// The existing cell has more lines than we received, remove the trailing lines
					// (from the end of the last received line to the end of the document).
					const range = new Range(cellInfo.lines.length - 1, cellInfo.lines.slice(-1)[0].length, doc.lineCount - 1, doc.lineAt(doc.lineCount - 1).text.length);
					yield [doc.uri, [new TextEdit(range, '')]];
				}
			} else if (line.type === 'line' && !cellInfo.ended) {
				cellInfo.lines.push(line.line);
				if (cellInfo.cell) {
					if (cellInfo.lines.length > cellInfo.cell.document.lineCount) {
						// More lines than the existing cell has, append the line (prefixed with the EOL of the cell).
						const range = new Range(cellInfo.lines.length - 1, 0, cellInfo.lines.length - 1, 0);
						const eol = cellInfo.cell.document.eol === EndOfLine.LF ? '\n' : '\r\n';
						const newText = `${eol}${line.line}`;
						yield [cellInfo.cell.document.uri, [new TextEdit(range, newText)]];
					} else {
						// Replace the whole existing line.
						const lineIndex = cellInfo.lines.length - 1;
						yield [cellInfo.cell.document.uri, [new TextEdit(new Range(lineIndex, 0, lineIndex, Constants.MAX_SAFE_SMALL_INTEGER), line.line)]];
					}
					editsEmitted = true;
				} else if (cellInfo.insertEdit) {
					// New cell that is still being streamed, the insert edit is emitted when the cell ends.
					const eolForNewCell = this.getEOLForNewCell(notebookOrUri, cellInfo.insertEdit.newCells[0].kind);
					cellInfo.insertEdit.newCells[0].value = cellInfo.lines.join(eolForNewCell);
				} else {
					// Insert the new cell.
					const cellData = new NotebookCellData(cellInfo.language === 'markdown' ? NotebookCellKind.Markup : NotebookCellKind.Code, line.line, cellInfo.language);
					cellInfo.insertEdit = NotebookEdit.insertCells(cellInfo.index, [cellData]);
				}
			}
		}

		if (isEmptyNotebook || !isNotebookAvailable) {
			return;
		}

		// If we have content in the original notebook and no edits were emitted,
		// But we have some content,
		// Then this can mean only one thing = invalid format.
		// If the format is correct, then we should have emitted some edits.
		// If we don't exit here we end up deleting all the cells in the notebook.
		if (!editsEmitted && allLines.length) {
			this.logger.warn(`No edits generated for notebook ${notebookOrUri.uri.toString()}. This is likely due to an invalid format. Expected format: ${format}. Provided content as follows:\n\n${allLines.join('\n')}`);
			return;
		}

		const modified = expectedCells.map(cell => cell.cell ? getCellId(cell.cell) : getCellIdOfNewCell(cell));
		const modifiedIds = new Set(modified);

		// Delete the missing cells (last one first, so that the indexes of the remaining ones stay valid).
		for (const missingCell of original.filter(cell => cell.uri && !modifiedIds.has(cell.id)).reverse()) {
			const cell = cellIdMap.get(missingCell.id);
			if (cell) {
				const index = original.indexOf(missingCell);
				yield NotebookEdit.deleteCells(new NotebookRange(index, index + 1));
				original.splice(index, 1);
			}
		}

		const originalIds = original.map(c => c.id);
		const result = await this.diffService.computeDiff(originalIds.join(EOL), modified.join(EOL), { computeMoves: false, ignoreTrimWhitespace: true, maxComputationTimeMs: 5_000 });
		const diffResult = computeDiff(originalIds, modified, result.changes);

		if (diffResult.every(d => d.type === 'unchanged')) {
			return;
		}

		// Delete items
		for (const change of diffResult.filter(d => d.type === 'delete').reverse()) {
			yield NotebookEdit.deleteCells(new NotebookRange(change.originalCellIndex, change.originalCellIndex + 1));
		}

		// insert items
		for (const change of diffResult.filter(d => d.type === 'insert')) {
			const expectedCell = expectedCells[change.modifiedCellIndex];
			const kind = expectedCell.language === 'markdown' ? NotebookCellKind.Markup : NotebookCellKind.Code;
			const eolForNewCell = this.getEOLForNewCell(notebookOrUri, kind);
			const source = expectedCell.lines.join(eolForNewCell);
			const cellData = new NotebookCellData(kind, source, expectedCell.language);
			yield NotebookEdit.insertCells(expectedCell.index, [cellData]);
		}
	}

}

/**
 * Converts a string into a stream of lines.
 */
export function textToAsyncIterableLines(text: string): AsyncIterable<LineOfText> {
	const source = new AsyncIterableSource<string>();
	source.emitOne(text);
	source.resolve();
	return streamLines(source.asyncIterable);
}


/**
 * Split an incoming stream of text to a stream of lines.
 * Lines are split on `\n`, which is not part of the emitted lines.
 */
function streamLines(source: AsyncIterable<string>): AsyncIterableObject<LineOfText> {
	return new AsyncIterableObject<LineOfText>(async (emitter) => {
		let buffer = '';
		for await (const str of source) {
			buffer += str;
			// Emit every complete line that is currently buffered.
			let newlineIndex = buffer.indexOf('\n');
			while (newlineIndex !== -1) {
				const line = buffer.substring(0, newlineIndex);
				buffer = buffer.substring(newlineIndex + 1);
				emitter.emitOne(new LineOfText(line));
				newlineIndex = buffer.indexOf('\n');
			}
		}

		if (buffer.length > 0) {
			// last line which doesn't end with \n
			emitter.emitOne(new LineOfText(buffer));
		}
	});
}


/**
 * Forwards all the lines of the source, and completes the given promise with the first line that is not
 * empty (ignoring whitespace), or with an empty line when the source has no such line.
 */
function readFirstNonEmptyLineAndKeepStreaming(source: AsyncIterable<LineOfText>, firstNonEmptyLine: DeferredPromise<LineOfText>): AsyncIterable<LineOfText> {
	return new AsyncIterableObject<LineOfText>(async (emitter) => {
		try {
			for await (const line of source) {
				if (!firstNonEmptyLine.isSettled && line.value.trim().length) {
					firstNonEmptyLine.complete(line);
				}
				emitter.emitOne(line);
			}
		} finally {
			// Always settle the promise, even when the source fails before producing a non-empty line,
			// else whoever awaits the promise would wait forever.
			// NOTE(review): the failure itself is presumably surfaced to the consumer of the returned iterable — confirm against AsyncIterableObject.
			if (!firstNonEmptyLine.isSettled) {
				firstNonEmptyLine.complete(new LineOfText(''));
			}
		}
	});
}

/**
 * Forwards all the lines of the source, recording the text of each line in `lines` along the way.
 */
function collectWhileStreaming(source: AsyncIterable<LineOfText>, lines: string[]): AsyncIterable<LineOfText> {
	return new AsyncIterableObject<LineOfText>(async (emitter) => {
		for await (const line of source) {
			lines.push(line.value);
			emitter.emitOne(line);
		}
	});
}