Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/notebook/common/services/notebookCellMatching.ts
3296 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { computeLevenshteinDistance } from '../../../../../base/common/diff/diff.js';
7
import { CellKind } from '../notebookCommon.js';
8
9
10
type EditCount = number;
11
type OriginalIndex = number;
12
type ModifiedIndex = number;
13
type CellEditCountCache = {
14
modifiedToOriginal: Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>;
15
originalToModified: Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>;
16
};
17
18
type ICell = {
19
internalMetadata?: {
20
internalId?: string;
21
};
22
getValue(): string;
23
getLinesContent(): string[];
24
cellKind: CellKind;
25
};
26
27
/**
28
* Given a set of modified cells and original cells, this function will attempt to match the modified cells with the original cells.
29
* E.g. Assume you have (original on left and modified on right):
30
* =================
31
* Cell A | Cell a
32
* Cell B | Cell b
33
* Cell C | Cell d
34
* Cell D | Cell e
35
* =================
36
* Here we know that `Cell C` has been removed and `Cell e` has been added.
37
* The mapping from modified to original will be as follows:
38
* Cell a => Cell A
39
* Cell b => Cell B
40
* Cell d => Cell D
41
* Cell e => <Does not match anything in original, hence a new Cell>
42
* Cell C in original was not matched, hence it was deleted.
43
*
44
* Thus the return value is as follows:
45
* [
46
* { modified: 0, original: 0 },
47
* { modified: 1, original: 1 },
48
* { modified: 2, original: 3 },
49
* { modified: 3, original: -1 },
50
* ]
51
* @returns
52
*/
53
export function matchCellBasedOnSimilarties(modifiedCells: ICell[], originalCells: ICell[]): { modified: number; original: number; percentage: number }[] {
54
const cache: CellEditCountCache = {
55
modifiedToOriginal: new Map<ModifiedIndex, Map<OriginalIndex, { editCount: EditCount }>>(),
56
originalToModified: new Map<OriginalIndex, Map<ModifiedIndex, { editCount: EditCount }>>(),
57
};
58
const results: { modified: number; original: number; dist: number; percentage: number; possibleOriginal: number }[] = [];
59
const mappedOriginalCellToModifiedCell = new Map<number, number>();
60
const mappedModifiedIndexes = new Set<number>();
61
const originalIndexWithMostEdits = new Map<number, { dist: number; modifiedIndex: number }>();
62
const canOriginalIndexBeMappedToModifiedIndex = (originalIndex: number, value: { editCount: EditCount }) => {
63
if (mappedOriginalCellToModifiedCell.has(originalIndex)) {
64
return false;
65
}
66
const existingEdits = originalIndexWithMostEdits.get(originalIndex)?.dist ?? Number.MAX_SAFE_INTEGER;
67
return value.editCount < existingEdits;
68
};
69
const trackMappedIndexes = (modifiedIndex: number, originalIndex: number) => {
70
mappedOriginalCellToModifiedCell.set(originalIndex, modifiedIndex);
71
mappedModifiedIndexes.add(modifiedIndex);
72
};
73
74
for (let i = 0; i < modifiedCells.length; i++) {
75
const modifiedCell = modifiedCells[i];
76
const { index, editCount: dist, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, true, cache, canOriginalIndexBeMappedToModifiedIndex);
77
if (index >= 0 && dist === 0) {
78
trackMappedIndexes(i, index);
79
results.push({ modified: i, original: index, dist, percentage, possibleOriginal: index });
80
} else {
81
originalIndexWithMostEdits.set(index, { dist: dist, modifiedIndex: i });
82
results.push({ modified: i, original: -1, dist: dist, percentage, possibleOriginal: index });
83
}
84
}
85
86
results.forEach((result, i) => {
87
if (result.original >= 0) {
88
return;
89
}
90
91
/**
92
* I.e. Assume you have the following
93
* =================
94
* A a (this has ben matched)
95
* B b <not matched>
96
* C c <not matched>
97
* D d (these two have been matched)
98
* e e
99
* f f
100
* =================
101
* Just match A => a, B => b, C => c
102
*/
103
// Find the next cell that has been matched.
104
const previousMatchedCell = i > 0 ? results.slice(0, i).reverse().find(r => r.original >= 0) : undefined;
105
const previousMatchedOriginalIndex = previousMatchedCell?.original ?? -1;
106
const previousMatchedModifiedIndex = previousMatchedCell?.modified ?? -1;
107
const matchedCell = results.slice(i + 1).find(r => r.original >= 0);
108
const unavailableIndexes = new Set<number>();
109
const nextMatchedModifiedIndex = results.findIndex((item, idx) => idx > i && item.original >= 0);
110
const nextMatchedOriginalIndex = nextMatchedModifiedIndex >= 0 ? results[nextMatchedModifiedIndex].original : -1;
111
// Find the available indexes that we can match with.
112
// We are only interested in b and c (anything after d is of no use).
113
originalCells.forEach((_, i) => {
114
if (mappedOriginalCellToModifiedCell.has(i)) {
115
unavailableIndexes.add(i);
116
return;
117
}
118
if (matchedCell && i >= matchedCell.original) {
119
unavailableIndexes.add(i);
120
}
121
if (nextMatchedOriginalIndex >= 0 && i > nextMatchedOriginalIndex) {
122
unavailableIndexes.add(i);
123
}
124
});
125
126
127
const modifiedCell = modifiedCells[i];
128
/**
129
* I.e. Assume you have the following
130
* =================
131
* A a (this has ben matched)
132
* B b <not matched because the % of change is too high, but we do have a probable match>
133
* C c <not matched>
134
* D d (these two have been matched)
135
* e e
136
* f f
137
* =================
138
* Given that we have a probable match for B => b, we can match it.
139
*/
140
if (result.original === -1 && result.possibleOriginal >= 0 && !unavailableIndexes.has(result.possibleOriginal) && canOriginalIndexBeMappedToModifiedIndex(result.possibleOriginal, { editCount: result.dist })) {
141
trackMappedIndexes(i, result.possibleOriginal);
142
result.original = result.possibleOriginal;
143
return;
144
}
145
146
147
/**
148
* I.e. Assume you have the following
149
* =================
150
* A a (this has ben matched)
151
* B b <not matched>
152
* C c <not matched>
153
* D d (these two have been matched)
154
* =================
155
* Its possible that B matches better with c and C matches better with b.
156
* However given the fact that we have matched A => a and D => d.
157
* & if the indexes are an exact match.
158
* I.e. index of D in Modified === index of d in Original, and index of A in Modified === index of a in Original.
159
* Then this means there are absolutely no modifications.
160
* Hence we can just assign the indexes as is.
161
*
162
* NOTE: For this, we must ensure we have exactly the same number of items on either side.
163
* I.e. we have B, C remaining in Modified, and b, c remaining in Original.
164
* Thats 2 Modified items === 2 Original Items.
165
* If its not the same, then this means something has been deleted/inserted, and we cannot blindly map the indexes.
166
*/
167
if (previousMatchedOriginalIndex > 0 && previousMatchedModifiedIndex > 0 && previousMatchedOriginalIndex === previousMatchedModifiedIndex) {
168
if ((nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length - 1) === (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length - 1) && !unavailableIndexes.has(i) && i < originalCells.length) {
169
const remainingModifiedItems = (nextMatchedModifiedIndex >= 0 ? nextMatchedModifiedIndex : modifiedCells.length) - previousMatchedModifiedIndex;
170
const remainingOriginalItems = (nextMatchedOriginalIndex >= 0 ? nextMatchedOriginalIndex : originalCells.length) - previousMatchedOriginalIndex;
171
if (remainingModifiedItems === remainingOriginalItems && modifiedCell.cellKind === originalCells[i].cellKind) {
172
trackMappedIndexes(i, i);
173
result.original = i;
174
return;
175
}
176
}
177
}
178
/**
179
* I.e. Assume you have the following
180
* =================
181
* A a (this has ben matched)
182
* B b <not matched>
183
* C c <not matched>
184
* D d (these two have been matched)
185
* e e
186
* f f
187
* =================
188
* We can now try to match B with b and c and figure out which is best.
189
* RULE 1. Its possible that B will match best with c, howevber C matches better with c, meaning we should match B with b.
190
* To do this, we need to see if c has a better match with something else.
191
*/
192
// RULE 1
193
// Try to find the next best match, but exclucde items that have a better match.
194
const { index, percentage } = computeClosestCell({ cell: modifiedCell, index: i }, originalCells, false, cache, (originalIndex: number, originalValue: { editCount: EditCount }) => {
195
if (unavailableIndexes.has(originalIndex)) {
196
return false;
197
}
198
199
if (nextMatchedModifiedIndex > 0 || previousMatchedOriginalIndex > 0) {
200
// See if we have a beter match for this.
201
const matchesForThisOriginalIndex = cache.originalToModified.get(originalIndex);
202
if (matchesForThisOriginalIndex && previousMatchedOriginalIndex < originalIndex) {
203
const betterMatch = Array.from(matchesForThisOriginalIndex).find(([modifiedIndex, value]) => {
204
if (modifiedIndex === i) {
205
// This is the same modifeid entry.
206
return false;
207
}
208
if (modifiedIndex >= nextMatchedModifiedIndex) {
209
// We're only interested in matches that are before the next matched index.
210
return false;
211
}
212
if (mappedModifiedIndexes.has(i)) {
213
// This has already been matched.
214
return false;
215
}
216
return value.editCount < originalValue.editCount;
217
});
218
if (betterMatch) {
219
// We do have a better match for this, hence do not use this.
220
return false;
221
}
222
}
223
}
224
return !unavailableIndexes.has(originalIndex);
225
});
226
227
/**
228
* I.e. Assume you have the following
229
* =================
230
* A a (this has ben matched)
231
* B bbbbbbbbbbbbbb <not matched>
232
* C cccccccccccccc <not matched>
233
* D d (these two have been matched)
234
* e e
235
* f f
236
* =================
237
* RULE 1 . Now when attempting to match `bbbbbbbbbbbb` with B, the number of edits is very high and the percentage is also very high.
238
* Basically majority of the text needs to be changed.
239
* However if the indexes line up perfectly well, and this is the best match, then use it.
240
*
241
* Similarly its possible we're trying to match b with `BBBBBBBBBBBB` and the number of edits is very high, but the indexes line up perfectly well.
242
*
243
* RULE 2. However it is also possible that there's a better match with another cell
244
* Assume we have
245
* =================
246
* AAAA a (this has been matched)
247
* bbbbbbbb b <not matched>
248
* bbbb c <not matched>
249
* dddd d (these two have been matched)
250
* =================
251
* In this case if we use the algorithm of (1) above, we'll end up matching bbbb with b, and bbbbbbbb with c.
252
* But we're not really sure if this is the best match.
253
* In such cases try to match with the same cell index.
254
*
255
*/
256
// RULE 1 (got a match and the indexes line up perfectly well, use it regardless of the number of edits).
257
if (index >= 0 && i > 0 && results[i - 1].original === index - 1) {
258
trackMappedIndexes(i, index);
259
results[i].original = index;
260
return;
261
}
262
263
// RULE 2
264
// Here we know that `AAAA => a`
265
// Check if the previous cell has been matched.
266
// And if the next modified and next original cells are a match.
267
const nextOriginalCell = (i > 0 && originalCells.length > results[i - 1].original) ? results[i - 1].original + 1 : -1;
268
const nextOriginalCellValue = i > 0 && nextOriginalCell >= 0 && nextOriginalCell < originalCells.length ? originalCells[nextOriginalCell].getValue() : undefined;
269
if (index >= 0 && i > 0 && typeof nextOriginalCellValue === 'string' && !mappedOriginalCellToModifiedCell.has(nextOriginalCell)) {
270
if (modifiedCell.getValue().includes(nextOriginalCellValue) || nextOriginalCellValue.includes(modifiedCell.getValue())) {
271
trackMappedIndexes(i, nextOriginalCell);
272
results[i].original = nextOriginalCell;
273
return;
274
}
275
}
276
277
if (percentage < 90 || (i === 0 && results.length === 1)) {
278
trackMappedIndexes(i, index);
279
results[i].original = index;
280
return;
281
}
282
});
283
284
return results;
285
}
286
287
function computeClosestCell({ cell, index: cellIndex }: { cell: ICell; index: number }, arr: readonly ICell[], ignoreEmptyCells: boolean, cache: CellEditCountCache, canOriginalIndexBeMappedToModifiedIndex: (originalIndex: number, value: { editCount: EditCount }) => boolean): { index: number; editCount: number; percentage: number } {
288
let min_edits = Infinity;
289
let min_index = -1;
290
291
// Always give preference to internal Cell Id if found.
292
const internalId = cell.internalMetadata?.internalId;
293
if (internalId) {
294
const internalIdIndex = arr.findIndex(cell => cell.internalMetadata?.internalId === internalId);
295
if (internalIdIndex >= 0) {
296
return { index: internalIdIndex, editCount: 0, percentage: Number.MAX_SAFE_INTEGER };
297
}
298
}
299
300
for (let i = 0; i < arr.length; i++) {
301
// Skip cells that are not of the same kind.
302
if (arr[i].cellKind !== cell.cellKind) {
303
continue;
304
}
305
const str = arr[i].getValue();
306
const cacheEntry = cache.modifiedToOriginal.get(cellIndex) ?? new Map<OriginalIndex, { editCount: EditCount }>();
307
const value = cacheEntry.get(i) ?? { editCount: computeNumberOfEdits(cell, arr[i]), };
308
cacheEntry.set(i, value);
309
cache.modifiedToOriginal.set(cellIndex, cacheEntry);
310
311
const originalCacheEntry = cache.originalToModified.get(i) ?? new Map<ModifiedIndex, { editCount: EditCount }>();
312
originalCacheEntry.set(cellIndex, value);
313
cache.originalToModified.set(i, originalCacheEntry);
314
315
if (!canOriginalIndexBeMappedToModifiedIndex(i, value)) {
316
continue;
317
}
318
if (str.length === 0 && ignoreEmptyCells) {
319
continue;
320
}
321
if (str === cell.getValue() && cell.getValue().length > 0) {
322
return { index: i, editCount: 0, percentage: 0 };
323
}
324
325
if (value.editCount < min_edits) {
326
min_edits = value.editCount;
327
min_index = i;
328
}
329
}
330
331
if (min_index === -1) {
332
return { index: -1, editCount: Number.MAX_SAFE_INTEGER, percentage: Number.MAX_SAFE_INTEGER };
333
}
334
const percentage = !cell.getValue().length && !arr[min_index].getValue().length ? 0 : (cell.getValue().length ? (min_edits * 100 / cell.getValue().length) : Number.MAX_SAFE_INTEGER);
335
return { index: min_index, editCount: min_edits, percentage };
336
}
337
338
function computeNumberOfEdits(modified: ICell, original: ICell) {
339
if (modified.getValue() === original.getValue()) {
340
return 0;
341
}
342
343
return computeLevenshteinDistance(modified.getValue(), original.getValue());
344
}
345
346