Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/util/common/anomalyDetection.ts
13397 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
/** Sometimes the model gets caught in repeating a simplistic and unhelpful pattern
7
* This file provides functionality to check whether this might be the case
8
*/
9
interface RepetitionConfig {
10
max_token_sequence_length: number;
11
last_tokens_to_consider: number;
12
}
13
14
const configs: RepetitionConfig[] = [
15
// in case of single token, 10 repetitions is too much already:
16
{ max_token_sequence_length: 1, last_tokens_to_consider: 10 },
17
// if the last 30 tokens are a repeat of up to 10 tokens, then it's a pattern:
18
{ max_token_sequence_length: 10, last_tokens_to_consider: 30 },
19
// if the pattern is very long, it needs to account for a long stretch so we can be sure
20
{ max_token_sequence_length: 20, last_tokens_to_consider: 45 },
21
{ max_token_sequence_length: 30, last_tokens_to_consider: 60 },
22
{ max_token_sequence_length: 60, last_tokens_to_consider: 120 },
23
];
24
25
/**
26
* Given a string calculates how many times each line in the string is repeated
27
* @param text The string to analyze
28
* @returns The repeating line, the number of times it repeats, total number of lines
29
*/
30
export function calculateLineRepetitionStats(text: string): { numberOfRepetitions: number; mostRepeatedLine: string; totalLines: number } {
31
if (text.length === 0) {
32
return { numberOfRepetitions: 0, mostRepeatedLine: '', totalLines: 0 };
33
}
34
const repetitionMap = new Map<string, number>();
35
const lines = text.split('\n');
36
for (let line of lines) {
37
line = line.trim();
38
if (line.length === 0) {
39
continue;
40
}
41
const repetitions = repetitionMap.get(line) || 0;
42
repetitionMap.set(line, repetitions + 1);
43
}
44
45
let mostRepeatedLine = '';
46
let maxRepetitions = 0;
47
for (const [line, repetitions] of repetitionMap.entries()) {
48
if (repetitions > maxRepetitions) {
49
maxRepetitions = repetitions;
50
mostRepeatedLine = line;
51
}
52
}
53
54
return { numberOfRepetitions: maxRepetitions, mostRepeatedLine, totalLines: lines.length };
55
}
56
57
/**
58
* Return whether the given token array ends in a repetition of a pattern.
59
* Controlling the necessary pattern length is set in the configs array.
60
*/
61
export function isRepetitive(tokens: readonly string[]): boolean {
62
const tokensBackwards = tokens.slice();
63
tokensBackwards.reverse();
64
return (
65
isRepeatedPattern(tokensBackwards) ||
66
isRepeatedPattern(tokensBackwards.filter(token => token.trim().length > 0))
67
);
68
}
69
70
/**
71
* Determine whether the given array or string starts with the repetition of a pattern,
72
* according to one of the predefined configs.
73
*/
74
function isRepeatedPattern<T>(s: ArrayLike<T>): boolean {
75
const prefix = kmp_prefix_function(s);
76
for (const config of configs) {
77
if (s.length < config.last_tokens_to_consider) {
78
continue;
79
}
80
// This is the smallest number of characters that one may shift `s` so that it
81
// overlaps with itself. That is also the smallest length of a repeated
82
// pattern that makes up `s`, where the last repetition is possibly truncated.
83
const patternLength = config.last_tokens_to_consider - 1 - prefix[config.last_tokens_to_consider - 1];
84
if (patternLength <= config.max_token_sequence_length) {
85
return true;
86
}
87
}
88
return false;
89
}
90
91
/** Return the Knuth-Morris-Pratt prefix function pi.
92
* For each i=0,..,.s.length-1, then
93
* pi[i] = max(j < i, s.slice(0,i+1).beginsWith(s.slice(0, j+1)))
94
* (note pi[0] = -1 by this definition)
95
* Adapted from
96
* Introduction to Algorithms, 3rd edition, by Thomas H. Cormen, et al.
97
*/
98
function kmp_prefix_function<T>(s: ArrayLike<T>): number[] {
99
const pi = Array(s.length).fill(0);
100
pi[0] = -1;
101
let k = -1;
102
for (let q = 1; q < s.length; q++) {
103
while (k >= 0 && s[k + 1] !== s[q]) {
104
k = pi[k];
105
}
106
if (s[k + 1] === s[q]) {
107
k++;
108
}
109
pi[q] = k;
110
}
111
return pi;
112
}
113
114