CoCalc -- notebookCellMatching.ts

GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/notebook/common/services/notebookCellMatching.ts
³²⁹⁶ views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import { computeLevenshteinDistance } from '../../../../../base/common/diff/diff.js';
7
import { CellKind } from '../notebookCommon.js';
8

9

10
type EditCount = number;
11
type OriginalIndex = number;
12
type ModifiedIndex = number;
13
type CellEditCountCache = {
14
	modifiedToOriginal: Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>;
15
	originalToModified: Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>;
16
};
17

18
type ICell = {
19
	internalMetadata?: {
20
		internalId?: string;
21
	};
22
	getValue(): string;
23
	getLinesContent(): string[];
24
	cellKind: CellKind;
25
};
26

27
/**
28
 * Given a set of modified cells and original cells, this function will attempt to match the modified cells with the original cells.
29
 * E.g. Assume you have (original on left and modified on right):
30
 * =================
31
 * Cell A  | Cell a
32
 * Cell B  | Cell b
33
 * Cell C  | Cell d
34
 * Cell D  | Cell e
35
 * =================
36
 * Here we know that `Cell C` has been removed and `Cell e` has been added.
37
 * The mapping from modified to original will be as follows:
38
 * Cell a => Cell A
39
 * Cell b => Cell B
40
 * Cell d => Cell D
41
 * Cell e => <Does not match anything in original, hence a new Cell>
42
 * Cell C in original was not matched, hence it was deleted.
43
 *
44
 * Thus the return value is as follows:
45
 * [
46
 * { modified: 0, original: 0 },
47
 * { modified: 1, original: 1 },
48
 * { modified: 2, original: 3 },
49
 * { modified: 3, original: -1 },
50
 * ]
51
 * @returns
52
 */
53
export function matchCellBasedOnSimilarties(modifiedCells: ICell[], originalCells: ICell[]): { modified: number; original: number; percentage: number }[] {
54
	const cache: CellEditCountCache = {
55
		modifiedToOriginal: new Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>(),
56
		originalToModified: new Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>(),
57
	};
58
	const results: { modified: number; original: number; dist: number; percentage: number; possibleOriginal: number }[] = [];
59
	const mappedOriginalCellToModifiedCell = new Map<number, number>();
60
	const mappedModifiedIndexes = new Set<number>();
61
	const originalIndexWithMostEdits = new Map<number, { dist: number; modifiedIndex: number }>();
62
	const canOriginalIndexBeMappedToModifiedIndex = (originalIndex: number, value: { editCount: EditCount }) => {
63
		if (mappedOriginalCellToModifiedCell.has(originalIndex)) {
64
			return false;
65
		}
66
		const existingEdits = originalIndexWithMostEdits.get(originalIndex)?.dist ?? Number.MAX_SAFE_INTEGER;
67
		return value.editCount < existingEdits;
68
	};
69
	const trackMappedIndexes = (modifiedIndex: number, originalIndex: number) => {
70
		mappedOriginalCellToModifiedCell.set(originalIndex, modifiedIndex);
71
		mappedModifiedIndexes.add(modifiedIndex);
72
	};
73

74
	for (let i = 0; i < modifiedCells.length; i++) {
75
		const modifiedCell = modifiedCells[i];
76
		const { index, editCount: dist, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, true, cache, canOriginalIndexBeMappedToModifiedIndex);
77
		if (index >= 0 && dist === 0) {
78
			trackMappedIndexes(i, index);
79
			results.push({ modified: i, original: index, dist, percentage, possibleOriginal: index });
80
		} else {
81
			originalIndexWithMostEdits.set(index, { dist: dist, modifiedIndex: i });
82
			results.push({ modified: i, original: -1, dist: dist, percentage, possibleOriginal: index });
83
		}
84
	}
85

86
	results.forEach((result, i) => {
87
		if (result.original >= 0) {
88
			return;
89
		}
90

91
		/**
92
		 * I.e. Assume you have the following
93
		 * =================
94
		 * A a (this has ben matched)
95
		 * B b <not matched>
96
		 * C c <not matched>
97
		 * D d (these two have been matched)
98
		 * e e
99
		 * f f
100
		 * =================
101
		 * Just match A => a, B => b, C => c
102
		 */
103
		// Find the next cell that has been matched.
104
		const previousMatchedCell = i > 0 ? results.slice(0, i).reverse().find(r => r.original >= 0) : undefined;
105
		const previousMatchedOriginalIndex = previousMatchedCell?.original ?? -1;
106
		const previousMatchedModifiedIndex = previousMatchedCell?.modified ?? -1;
107
		const matchedCell = results.slice(i + 1).find(r => r.original >= 0);
108
		const unavailableIndexes = new Set<number>();
109
		const nextMatchedModifiedIndex = results.findIndex((item, idx) => idx > i && item.original >= 0);
110
		const nextMatchedOriginalIndex = nextMatchedModifiedIndex >= 0 ? results[nextMatchedModifiedIndex].original : -1;
111
		// Find the available indexes that we can match with.
112
		// We are only interested in b and c (anything after d is of no use).
113
		originalCells.forEach((_, i) => {
114
			if (mappedOriginalCellToModifiedCell.has(i)) {
115
				unavailableIndexes.add(i);
116
				return;
117
			}
118
			if (matchedCell && i >= matchedCell.original) {
119
				unavailableIndexes.add(i);
120
			}
121
			if (nextMatchedOriginalIndex >= 0 && i > nextMatchedOriginalIndex) {
122
				unavailableIndexes.add(i);
123
			}
124
		});
125

126

127
		const modifiedCell = modifiedCells[i];
128
		/**
129
		 * I.e. Assume you have the following
130
		 * =================
131
		 * A a (this has ben matched)
132
		 * B b <not matched because the % of change is too high, but we do have a probable match>
133
		 * C c <not matched>
134
		 * D d (these two have been matched)
135
		 * e e
136
		 * f f
137
		 * =================
138
		 * Given that we have a probable match for B => b, we can match it.
139
		 */
140
		if (result.original === -1 && result.possibleOriginal >= 0 && !unavailableIndexes.has(result.possibleOriginal) && canOriginalIndexBeMappedToModifiedIndex(result.possibleOriginal, { editCount: result.dist })) {
141
			trackMappedIndexes(i, result.possibleOriginal);
142
			result.original = result.possibleOriginal;
143
			return;
144
		}
145

146

147
		/**
148
		 * I.e. Assume you have the following
149
		 * =================
150
		 * A a (this has ben matched)
151
		 * B b <not matched>
152
		 * C c <not matched>
153
		 * D d (these two have been matched)
154
		 * =================
155
		 * Its possible that B matches better with c and C matches better with b.
156
		 * However given the fact that we have matched A => a and D => d.
157
		 * & if the indexes are an exact match.
158
		 * I.e. index of D in Modified === index of d in Original, and index of A in Modified === index of a in Original.
159
		 * Then this means there are absolutely no modifications.
160
		 * Hence we can just assign the indexes as is.
161
		 *
162
		 * NOTE: For this, we must ensure we have exactly the same number of items on either side.
163
		 * I.e. we have B, C remaining in Modified, and b, c remaining in Original.
164
		 * Thats 2 Modified items === 2 Original Items.
165
		 * If its not the same, then this means something has been deleted/inserted, and we cannot blindly map the indexes.
166
		*/
167
		if (previousMatchedOriginalIndex > 0 && previousMatchedModifiedIndex > 0 && previousMatchedOriginalIndex === previousMatchedModifiedIndex) {
168
			if ((nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length - 1) === (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length - 1) && !unavailableIndexes.has(i) && i < originalCells.length) {
169
				const remainingModifiedItems = (nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length) - previousMatchedModifiedIndex;
170
				const remainingOriginalItems = (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length) - previousMatchedOriginalIndex;
171
				if (remainingModifiedItems === remainingOriginalItems && modifiedCell.cellKind === originalCells[i].cellKind) {
172
					trackMappedIndexes(i, i);
173
					result.original = i;
174
					return;
175
				}
176
			}
177
		}
178
		/**
179
		 * I.e. Assume you have the following
180
		 * =================
181
		 * A a (this has ben matched)
182
		 * B b <not matched>
183
		 * C c <not matched>
184
		 * D d (these two have been matched)
185
		 * e e
186
		 * f f
187
		 * =================
188
		 * We can now try to match B with b and c and figure out which is best.
189
		 * RULE 1. Its possible that B will match best with c, howevber C matches better with c, meaning we should match B with b.
190
		 * To do this, we need to see if c has a better match with something else.
191
		*/
192
		// RULE 1
193
		// Try to find the next best match, but exclucde items that have a better match.
194
		const { index, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, false, cache, (originalIndex: number, originalValue: { editCount: EditCount }) => {
195
			if (unavailableIndexes.has(originalIndex)) {
196
				return false;
197
			}
198

199
			if (nextMatchedModifiedIndex > 0 || previousMatchedOriginalIndex > 0) {
200
				// See if we have a beter match for this.
201
				const matchesForThisOriginalIndex = cache.originalToModified.get(originalIndex);
202
				if (matchesForThisOriginalIndex && previousMatchedOriginalIndex < originalIndex) {
203
					const betterMatch = Array.from(matchesForThisOriginalIndex).find(([modifiedIndex, value]) => {
204
						if (modifiedIndex === i) {
205
							// This is the same modifeid entry.
206
							return false;
207
						}
208
						if (modifiedIndex >= nextMatchedModifiedIndex) {
209
							// We're only interested in matches that are before the next matched index.
210
							return false;
211
						}
212
						if (mappedModifiedIndexes.has(i)) {
213
							// This has already been matched.
214
							return false;
215
						}
216
						return value.editCount < originalValue.editCount;
217
					});
218
					if (betterMatch) {
219
						// We do have a better match for this, hence do not use this.
220
						return false;
221
					}
222
				}
223
			}
224
			return !unavailableIndexes.has(originalIndex);
225
		});
226

227
		/**
228
		 * I.e. Assume you have the following
229
		 * =================
230
		 * A a (this has ben matched)
231
		 * B bbbbbbbbbbbbbb <not matched>
232
		 * C cccccccccccccc <not matched>
233
		 * D d (these two have been matched)
234
		 * e e
235
		 * f f
236
		 * =================
237
		 * RULE 1 . Now when attempting to match `bbbbbbbbbbbb` with B, the number of edits is very high and the percentage is also very high.
238
		 * Basically majority of the text needs to be changed.
239
		 * However if the indexes line up perfectly well, and this is the best match, then use it.
240
		*
241
		 * Similarly its possible we're trying to match b with `BBBBBBBBBBBB` and the number of edits is very high, but the indexes line up perfectly well.
242
		*
243
		* RULE 2. However it is also possible that there's a better match with another cell
244
		* Assume we have
245
		 * =================
246
		 * AAAA     a (this has been matched)
247
		 * bbbbbbbb b <not matched>
248
		 * bbbb     c <not matched>
249
		 * dddd     d (these two have been matched)
250
		 * =================
251
		 * In this case if we use the algorithm of (1) above, we'll end up matching bbbb with b, and bbbbbbbb with c.
252
		 * But we're not really sure if this is the best match.
253
		 * In such cases try to match with the same cell index.
254
		 *
255
		*/
256
		// RULE 1 (got a match and the indexes line up perfectly well, use it regardless of the number of edits).
257
		if (index >= 0 && i > 0 && results[i - 1].original === index - 1) {
258
			trackMappedIndexes(i, index);
259
			results[i].original = index;
260
			return;
261
		}
262

263
		// RULE 2
264
		// Here we know that `AAAA => a`
265
		// Check if the previous cell has been matched.
266
		// And if the next modified and next original cells are a match.
267
		const nextOriginalCell = (i > 0 && originalCells.length > results[i - 1].original) ? results[i - 1].original + 1 : -1;
268
		const nextOriginalCellValue = i > 0 && nextOriginalCell >= 0 && nextOriginalCell < originalCells.length ? originalCells[nextOriginalCell].getValue() : undefined;
269
		if (index >= 0 && i > 0 && typeof nextOriginalCellValue === 'string' && !mappedOriginalCellToModifiedCell.has(nextOriginalCell)) {
270
			if (modifiedCell.getValue().includes(nextOriginalCellValue) || nextOriginalCellValue.includes(modifiedCell.getValue())) {
271
				trackMappedIndexes(i, nextOriginalCell);
272
				results[i].original = nextOriginalCell;
273
				return;
274
			}
275
		}
276

277
		if (percentage < 90 || (i === 0 && results.length === 1)) {
278
			trackMappedIndexes(i, index);
279
			results[i].original = index;
280
			return;
281
		}
282
	});
283

284
	return results;
285
}
286

287
function computeClosestCell({ cell, index: cellIndex }: { cell: ICell; index: number }, arr: readonly ICell[], ignoreEmptyCells: boolean, cache: CellEditCountCache, canOriginalIndexBeMappedToModifiedIndex: (originalIndex: number, value: { editCount: EditCount }) => boolean): { index: number; editCount: number; percentage: number } {
288
	let min_edits = Infinity;
289
	let min_index = -1;
290

291
	// Always give preference to internal Cell Id if found.
292
	const internalId = cell.internalMetadata?.internalId;
293
	if (internalId) {
294
		const internalIdIndex = arr.findIndex(cell => cell.internalMetadata?.internalId === internalId);
295
		if (internalIdIndex >= 0) {
296
			return { index: internalIdIndex, editCount: 0, percentage: Number.MAX_SAFE_INTEGER };
297
		}
298
	}
299

300
	for (let i = 0; i < arr.length; i++) {
301
		// Skip cells that are not of the same kind.
302
		if (arr[i].cellKind !== cell.cellKind) {
303
			continue;
304
		}
305
		const str = arr[i].getValue();
306
		const cacheEntry = cache.modifiedToOriginal.get(cellIndex) ?? new Map<OriginalIndex, { editCount: EditCount }>();
307
		const value = cacheEntry.get(i) ?? { editCount: computeNumberOfEdits(cell, arr[i]), };
308
		cacheEntry.set(i, value);
309
		cache.modifiedToOriginal.set(cellIndex, cacheEntry);
310

311
		const originalCacheEntry = cache.originalToModified.get(i) ?? new Map<ModifiedIndex, { editCount: EditCount }>();
312
		originalCacheEntry.set(cellIndex, value);
313
		cache.originalToModified.set(i, originalCacheEntry);
314

315
		if (!canOriginalIndexBeMappedToModifiedIndex(i, value)) {
316
			continue;
317
		}
318
		if (str.length === 0 && ignoreEmptyCells) {
319
			continue;
320
		}
321
		if (str === cell.getValue() && cell.getValue().length > 0) {
322
			return { index: i, editCount: 0, percentage: 0 };
323
		}
324

325
		if (value.editCount < min_edits) {
326
			min_edits = value.editCount;
327
			min_index = i;
328
		}
329
	}
330

331
	if (min_index === -1) {
332
		return { index: -1, editCount: Number.MAX_SAFE_INTEGER, percentage: Number.MAX_SAFE_INTEGER };
333
	}
334
	const percentage = !cell.getValue().length && !arr[min_index].getValue().length ? 0 : (cell.getValue().length ? (min_edits * 100 / cell.getValue().length) : Number.MAX_SAFE_INTEGER);
335
	return { index: min_index, editCount: min_edits, percentage };
336
}
337

338
function computeNumberOfEdits(modified: ICell, original: ICell) {
339
	if (modified.getValue() === original.getValue()) {
340
		return 0;
341
	}
342

343
	return computeLevenshteinDistance(modified.getValue(), original.getValue());
344
}
345

346
Product

Resources

Company