Path: blob/main/src/vs/workbench/contrib/notebook/common/services/notebookCellMatching.ts
3296 views
/*---------------------------------------------------------------------------------------------1* Copyright (c) Microsoft Corporation. All rights reserved.2* Licensed under the MIT License. See License.txt in the project root for license information.3*--------------------------------------------------------------------------------------------*/45import { computeLevenshteinDistance } from '../../../../../base/common/diff/diff.js';6import { CellKind } from '../notebookCommon.js';789type EditCount = number;10type OriginalIndex = number;11type ModifiedIndex = number;12type CellEditCountCache = {13modifiedToOriginal: Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>;14originalToModified: Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>;15};1617type ICell = {18internalMetadata?: {19internalId?: string;20};21getValue(): string;22getLinesContent(): string[];23cellKind: CellKind;24};2526/**27* Given a set of modified cells and original cells, this function will attempt to match the modified cells with the original cells.28* E.g. Assume you have (original on left and modified on right):29* =================30* Cell A | Cell a31* Cell B | Cell b32* Cell C | Cell d33* Cell D | Cell e34* =================35* Here we know that `Cell C` has been removed and `Cell e` has been added.36* The mapping from modified to original will be as follows:37* Cell a => Cell A38* Cell b => Cell B39* Cell d => Cell D40* Cell e => <Does not match anything in original, hence a new Cell>41* Cell C in original was not matched, hence it was deleted.42*43* Thus the return value is as follows:44* [45* { modified: 0, original: 0 },46* { modified: 1, original: 1 },47* { modified: 2, original: 3 },48* { modified: 3, original: -1 },49* ]50* @returns51*/52export function matchCellBasedOnSimilarties(modifiedCells: ICell[], originalCells: ICell[]): { modified: number; original: number; percentage: number }[] {53const cache: CellEditCountCache = {54modifiedToOriginal: new Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>(),55originalToModified: new Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>(),56};57const results: { modified: number; original: number; dist: number; percentage: number; possibleOriginal: number }[] = [];58const mappedOriginalCellToModifiedCell = new Map<number, number>();59const mappedModifiedIndexes = new Set<number>();60const originalIndexWithMostEdits = new Map<number, { dist: number; modifiedIndex: number }>();61const canOriginalIndexBeMappedToModifiedIndex = (originalIndex: number, value: { editCount: EditCount }) => {62if (mappedOriginalCellToModifiedCell.has(originalIndex)) {63return false;64}65const existingEdits = originalIndexWithMostEdits.get(originalIndex)?.dist ?? Number.MAX_SAFE_INTEGER;66return value.editCount < existingEdits;67};68const trackMappedIndexes = (modifiedIndex: number, originalIndex: number) => {69mappedOriginalCellToModifiedCell.set(originalIndex, modifiedIndex);70mappedModifiedIndexes.add(modifiedIndex);71};7273for (let i = 0; i < modifiedCells.length; i++) {74const modifiedCell = modifiedCells[i];75const { index, editCount: dist, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, true, cache, canOriginalIndexBeMappedToModifiedIndex);76if (index >= 0 && dist === 0) {77trackMappedIndexes(i, index);78results.push({ modified: i, original: index, dist, percentage, possibleOriginal: index });79} else {80originalIndexWithMostEdits.set(index, { dist: dist, modifiedIndex: i });81results.push({ modified: i, original: -1, dist: dist, percentage, possibleOriginal: index });82}83}8485results.forEach((result, i) => {86if (result.original >= 0) {87return;88}8990/**91* I.e. Assume you have the following92* =================93* A a (this has ben matched)94* B b <not matched>95* C c <not matched>96* D d (these two have been matched)97* e e98* f f99* =================100* Just match A => a, B => b, C => c101*/102// Find the next cell that has been matched.103const previousMatchedCell = i > 0 ? results.slice(0, i).reverse().find(r => r.original >= 0) : undefined;104const previousMatchedOriginalIndex = previousMatchedCell?.original ?? -1;105const previousMatchedModifiedIndex = previousMatchedCell?.modified ?? -1;106const matchedCell = results.slice(i + 1).find(r => r.original >= 0);107const unavailableIndexes = new Set<number>();108const nextMatchedModifiedIndex = results.findIndex((item, idx) => idx > i && item.original >= 0);109const nextMatchedOriginalIndex = nextMatchedModifiedIndex >= 0 ? results[nextMatchedModifiedIndex].original : -1;110// Find the available indexes that we can match with.111// We are only interested in b and c (anything after d is of no use).112originalCells.forEach((_, i) => {113if (mappedOriginalCellToModifiedCell.has(i)) {114unavailableIndexes.add(i);115return;116}117if (matchedCell && i >= matchedCell.original) {118unavailableIndexes.add(i);119}120if (nextMatchedOriginalIndex >= 0 && i > nextMatchedOriginalIndex) {121unavailableIndexes.add(i);122}123});124125126const modifiedCell = modifiedCells[i];127/**128* I.e. Assume you have the following129* =================130* A a (this has ben matched)131* B b <not matched because the % of change is too high, but we do have a probable match>132* C c <not matched>133* D d (these two have been matched)134* e e135* f f136* =================137* Given that we have a probable match for B => b, we can match it.138*/139if (result.original === -1 && result.possibleOriginal >= 0 && !unavailableIndexes.has(result.possibleOriginal) && canOriginalIndexBeMappedToModifiedIndex(result.possibleOriginal, { editCount: result.dist })) {140trackMappedIndexes(i, result.possibleOriginal);141result.original = result.possibleOriginal;142return;143}144145146/**147* I.e. Assume you have the following148* =================149* A a (this has ben matched)150* B b <not matched>151* C c <not matched>152* D d (these two have been matched)153* =================154* Its possible that B matches better with c and C matches better with b.155* However given the fact that we have matched A => a and D => d.156* & if the indexes are an exact match.157* I.e. index of D in Modified === index of d in Original, and index of A in Modified === index of a in Original.158* Then this means there are absolutely no modifications.159* Hence we can just assign the indexes as is.160*161* NOTE: For this, we must ensure we have exactly the same number of items on either side.162* I.e. we have B, C remaining in Modified, and b, c remaining in Original.163* Thats 2 Modified items === 2 Original Items.164* If its not the same, then this means something has been deleted/inserted, and we cannot blindly map the indexes.165*/166if (previousMatchedOriginalIndex > 0 && previousMatchedModifiedIndex > 0 && previousMatchedOriginalIndex === previousMatchedModifiedIndex) {167if ((nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length - 1) === (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length - 1) && !unavailableIndexes.has(i) && i < originalCells.length) {168const remainingModifiedItems = (nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length) - previousMatchedModifiedIndex;169const remainingOriginalItems = (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length) - previousMatchedOriginalIndex;170if (remainingModifiedItems === remainingOriginalItems && modifiedCell.cellKind === originalCells[i].cellKind) {171trackMappedIndexes(i, i);172result.original = i;173return;174}175}176}177/**178* I.e. Assume you have the following179* =================180* A a (this has ben matched)181* B b <not matched>182* C c <not matched>183* D d (these two have been matched)184* e e185* f f186* =================187* We can now try to match B with b and c and figure out which is best.188* RULE 1. Its possible that B will match best with c, howevber C matches better with c, meaning we should match B with b.189* To do this, we need to see if c has a better match with something else.190*/191// RULE 1192// Try to find the next best match, but exclucde items that have a better match.193const { index, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, false, cache, (originalIndex: number, originalValue: { editCount: EditCount }) => {194if (unavailableIndexes.has(originalIndex)) {195return false;196}197198if (nextMatchedModifiedIndex > 0 || previousMatchedOriginalIndex > 0) {199// See if we have a beter match for this.200const matchesForThisOriginalIndex = cache.originalToModified.get(originalIndex);201if (matchesForThisOriginalIndex && previousMatchedOriginalIndex < originalIndex) {202const betterMatch = Array.from(matchesForThisOriginalIndex).find(([modifiedIndex, value]) => {203if (modifiedIndex === i) {204// This is the same modifeid entry.205return false;206}207if (modifiedIndex >= nextMatchedModifiedIndex) {208// We're only interested in matches that are before the next matched index.209return false;210}211if (mappedModifiedIndexes.has(i)) {212// This has already been matched.213return false;214}215return value.editCount < originalValue.editCount;216});217if (betterMatch) {218// We do have a better match for this, hence do not use this.219return false;220}221}222}223return !unavailableIndexes.has(originalIndex);224});225226/**227* I.e. Assume you have the following228* =================229* A a (this has ben matched)230* B bbbbbbbbbbbbbb <not matched>231* C cccccccccccccc <not matched>232* D d (these two have been matched)233* e e234* f f235* =================236* RULE 1 . Now when attempting to match `bbbbbbbbbbbb` with B, the number of edits is very high and the percentage is also very high.237* Basically majority of the text needs to be changed.238* However if the indexes line up perfectly well, and this is the best match, then use it.239*240* Similarly its possible we're trying to match b with `BBBBBBBBBBBB` and the number of edits is very high, but the indexes line up perfectly well.241*242* RULE 2. However it is also possible that there's a better match with another cell243* Assume we have244* =================245* AAAA a (this has been matched)246* bbbbbbbb b <not matched>247* bbbb c <not matched>248* dddd d (these two have been matched)249* =================250* In this case if we use the algorithm of (1) above, we'll end up matching bbbb with b, and bbbbbbbb with c.251* But we're not really sure if this is the best match.252* In such cases try to match with the same cell index.253*254*/255// RULE 1 (got a match and the indexes line up perfectly well, use it regardless of the number of edits).256if (index >= 0 && i > 0 && results[i - 1].original === index - 1) {257trackMappedIndexes(i, index);258results[i].original = index;259return;260}261262// RULE 2263// Here we know that `AAAA => a`264// Check if the previous cell has been matched.265// And if the next modified and next original cells are a match.266const nextOriginalCell = (i > 0 && originalCells.length > results[i - 1].original) ? results[i - 1].original + 1 : -1;267const nextOriginalCellValue = i > 0 && nextOriginalCell >= 0 && nextOriginalCell < originalCells.length ? originalCells[nextOriginalCell].getValue() : undefined;268if (index >= 0 && i > 0 && typeof nextOriginalCellValue === 'string' && !mappedOriginalCellToModifiedCell.has(nextOriginalCell)) {269if (modifiedCell.getValue().includes(nextOriginalCellValue) || nextOriginalCellValue.includes(modifiedCell.getValue())) {270trackMappedIndexes(i, nextOriginalCell);271results[i].original = nextOriginalCell;272return;273}274}275276if (percentage < 90 || (i === 0 && results.length === 1)) {277trackMappedIndexes(i, index);278results[i].original = index;279return;280}281});282283return results;284}285286function computeClosestCell({ cell, index: cellIndex }: { cell: ICell; index: number }, arr: readonly ICell[], ignoreEmptyCells: boolean, cache: CellEditCountCache, canOriginalIndexBeMappedToModifiedIndex: (originalIndex: number, value: { editCount: EditCount }) => boolean): { index: number; editCount: number; percentage: number } {287let min_edits = Infinity;288let min_index = -1;289290// Always give preference to internal Cell Id if found.291const internalId = cell.internalMetadata?.internalId;292if (internalId) {293const internalIdIndex = arr.findIndex(cell => cell.internalMetadata?.internalId === internalId);294if (internalIdIndex >= 0) {295return { index: internalIdIndex, editCount: 0, percentage: Number.MAX_SAFE_INTEGER };296}297}298299for (let i = 0; i < arr.length; i++) {300// Skip cells that are not of the same kind.301if (arr[i].cellKind !== cell.cellKind) {302continue;303}304const str = arr[i].getValue();305const cacheEntry = cache.modifiedToOriginal.get(cellIndex) ?? new Map<OriginalIndex, { editCount: EditCount }>();306const value = cacheEntry.get(i) ?? { editCount: computeNumberOfEdits(cell, arr[i]), };307cacheEntry.set(i, value);308cache.modifiedToOriginal.set(cellIndex, cacheEntry);309310const originalCacheEntry = cache.originalToModified.get(i) ?? new Map<ModifiedIndex, { editCount: EditCount }>();311originalCacheEntry.set(cellIndex, value);312cache.originalToModified.set(i, originalCacheEntry);313314if (!canOriginalIndexBeMappedToModifiedIndex(i, value)) {315continue;316}317if (str.length === 0 && ignoreEmptyCells) {318continue;319}320if (str === cell.getValue() && cell.getValue().length > 0) {321return { index: i, editCount: 0, percentage: 0 };322}323324if (value.editCount < min_edits) {325min_edits = value.editCount;326min_index = i;327}328}329330if (min_index === -1) {331return { index: -1, editCount: Number.MAX_SAFE_INTEGER, percentage: Number.MAX_SAFE_INTEGER };332}333const percentage = !cell.getValue().length && !arr[min_index].getValue().length ? 0 : (cell.getValue().length ? (min_edits * 100 / cell.getValue().length) : Number.MAX_SAFE_INTEGER);334return { index: min_index, editCount: min_edits, percentage };335}336337function computeNumberOfEdits(modified: ICell, original: ICell) {338if (modified.getValue() === original.getValue()) {339return 0;340}341342return computeLevenshteinDistance(modified.getValue(), original.getValue());343}344345346