Path: blob/main/extensions/copilot/src/util/common/anomalyDetection.ts
13397 views
/*---------------------------------------------------------------------------------------------1* Copyright (c) Microsoft Corporation. All rights reserved.2* Licensed under the MIT License. See License.txt in the project root for license information.3*--------------------------------------------------------------------------------------------*/45/** Sometimes the model gets caught in repeating a simplistic and unhelpful pattern6* This file provides functionality to check whether this might be the case7*/8interface RepetitionConfig {9max_token_sequence_length: number;10last_tokens_to_consider: number;11}1213const configs: RepetitionConfig[] = [14// in case of single token, 10 repetitions is too much already:15{ max_token_sequence_length: 1, last_tokens_to_consider: 10 },16// if the last 30 tokens are a repeat of up to 10 tokens, then it's a pattern:17{ max_token_sequence_length: 10, last_tokens_to_consider: 30 },18// if the pattern is very long, it needs to account for a long stretch so we can be sure19{ max_token_sequence_length: 20, last_tokens_to_consider: 45 },20{ max_token_sequence_length: 30, last_tokens_to_consider: 60 },21{ max_token_sequence_length: 60, last_tokens_to_consider: 120 },22];2324/**25* Given a string calculates how many times each line in the string is repeated26* @param text The string to analyze27* @returns The repeating line, the number of times it repeats, total number of lines28*/29export function calculateLineRepetitionStats(text: string): { numberOfRepetitions: number; mostRepeatedLine: string; totalLines: number } {30if (text.length === 0) {31return { numberOfRepetitions: 0, mostRepeatedLine: '', totalLines: 0 };32}33const repetitionMap = new Map<string, number>();34const lines = text.split('\n');35for (let line of lines) {36line = line.trim();37if (line.length === 0) {38continue;39}40const repetitions = repetitionMap.get(line) || 0;41repetitionMap.set(line, repetitions + 1);42}4344let mostRepeatedLine = '';45let maxRepetitions = 0;46for (const [line, repetitions] of repetitionMap.entries()) {47if (repetitions > maxRepetitions) {48maxRepetitions = repetitions;49mostRepeatedLine = line;50}51}5253return { numberOfRepetitions: maxRepetitions, mostRepeatedLine, totalLines: lines.length };54}5556/**57* Return whether the given token array ends in a repetition of a pattern.58* Controlling the necessary pattern length is set in the configs array.59*/60export function isRepetitive(tokens: readonly string[]): boolean {61const tokensBackwards = tokens.slice();62tokensBackwards.reverse();63return (64isRepeatedPattern(tokensBackwards) ||65isRepeatedPattern(tokensBackwards.filter(token => token.trim().length > 0))66);67}6869/**70* Determine whether the given array or string starts with the repetition of a pattern,71* according to one of the predefined configs.72*/73function isRepeatedPattern<T>(s: ArrayLike<T>): boolean {74const prefix = kmp_prefix_function(s);75for (const config of configs) {76if (s.length < config.last_tokens_to_consider) {77continue;78}79// This is the smallest number of characters that one may shift `s` so that it80// overlaps with itself. That is also the smallest length of a repeated81// pattern that makes up `s`, where the last repetition is possibly truncated.82const patternLength = config.last_tokens_to_consider - 1 - prefix[config.last_tokens_to_consider - 1];83if (patternLength <= config.max_token_sequence_length) {84return true;85}86}87return false;88}8990/** Return the Knuth-Morris-Pratt prefix function pi.91* For each i=0,..,.s.length-1, then92* pi[i] = max(j < i, s.slice(0,i+1).beginsWith(s.slice(0, j+1)))93* (note pi[0] = -1 by this definition)94* Adapted from95* Introduction to Algorithms, 3rd edition, by Thomas H. Cormen, et al.96*/97function kmp_prefix_function<T>(s: ArrayLike<T>): number[] {98const pi = Array(s.length).fill(0);99pi[0] = -1;100let k = -1;101for (let q = 1; q < s.length; q++) {102while (k >= 0 && s[k + 1] !== s[q]) {103k = pi[k];104}105if (s[k + 1] === s[q]) {106k++;107}108pi[q] = k;109}110return pi;111}112113114