Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunker.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
6
import { ITokenizer } from '../../../util/common/tokenizer';
7
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
8
import { commonPrefixLength, isFalsyOrWhitespace, splitLines } from '../../../util/vs/base/common/strings';
9
import { URI } from '../../../util/vs/base/common/uri';
10
import { Range } from '../../../util/vs/editor/common/core/range';
11
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
12
import { FileChunk } from '../common/chunk';
13
14
/** Default per-chunk token budget used by {@link NaiveChunker.chunkFile}. */
export const MAX_CHUNK_SIZE_TOKENS = 250;
15
16
/** A single line that has been accepted into an in-progress chunk. */
interface IChunkedLine {
	/** Line content, length-capped and right-trimmed (see `_processLinesIntoChunks`). */
	readonly text: string;
	/** Zero-based index of this line in the original file's line array. */
	readonly lineNumber: number;
}
20
21
export class NaiveChunker {
22
private readonly tokenizer: ITokenizer;
23
24
constructor(
25
endpoint: TokenizationEndpoint,
26
@ITokenizerProvider tokenizerProvider: ITokenizerProvider
27
) {
28
this.tokenizer = tokenizerProvider.acquireTokenizer(endpoint);
29
}
30
31
async chunkFile(uri: URI, text: string, {
32
maxTokenLength = MAX_CHUNK_SIZE_TOKENS,
33
removeEmptyLines = true,
34
}: {
35
maxTokenLength?: number;
36
removeEmptyLines?: boolean;
37
}, token: CancellationToken): Promise<FileChunk[]> {
38
const chunks: FileChunk[] = [];
39
for await (const chunk of this._processLinesIntoChunks(
40
uri, text,
41
maxTokenLength,
42
true,
43
removeEmptyLines,
44
token
45
)) {
46
if (token.isCancellationRequested) {
47
return [];
48
}
49
50
if (!removeEmptyLines || (!!chunk.text.length && /[\w\d]{2}/.test(chunk.text))) {
51
chunks.push(chunk);
52
}
53
}
54
return chunks;
55
}
56
57
private async *_processLinesIntoChunks(
58
uri: URI,
59
text: string,
60
maxTokenLength: number,
61
shouldDedent: boolean,
62
removeEmptyLines: boolean,
63
token: CancellationToken,
64
): AsyncIterable<FileChunk> {
65
const originalLines = splitLines(text);
66
67
const accumulatingChunk: IChunkedLine[] = [];
68
let usedTokensInChunk = 0;
69
let longestCommonWhitespaceInChunk: string | undefined;
70
71
for (let i = 0; i < originalLines.length; ++i) {
72
const line = originalLines[i];
73
if (removeEmptyLines && isFalsyOrWhitespace(line)) {
74
continue;
75
}
76
77
const lineText = line.slice(0, maxTokenLength * 4).trimEnd();
78
const lineTokenCount = await this.tokenizer.tokenLength(lineText);
79
if (token.isCancellationRequested) {
80
return;
81
}
82
83
if (longestCommonWhitespaceInChunk === undefined || longestCommonWhitespaceInChunk.length > 0) {
84
const leadingWhitespaceMatches = line.match(/^\s+/);
85
const currentLeadingWhitespace = leadingWhitespaceMatches ? leadingWhitespaceMatches[0] : '';
86
87
longestCommonWhitespaceInChunk = longestCommonWhitespaceInChunk
88
? commonLeadingStr(longestCommonWhitespaceInChunk, currentLeadingWhitespace)
89
: currentLeadingWhitespace;
90
}
91
92
if (usedTokensInChunk + lineTokenCount > maxTokenLength) {
93
// Emit previous chunk and reset state
94
const chunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', false);
95
if (chunk) {
96
yield chunk;
97
}
98
99
accumulatingChunk.length = 0;
100
usedTokensInChunk = 0;
101
longestCommonWhitespaceInChunk = undefined;
102
}
103
104
accumulatingChunk.push({
105
text: lineText,
106
lineNumber: i,
107
});
108
usedTokensInChunk += lineTokenCount;
109
}
110
111
const finalChunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', true);
112
if (finalChunk) {
113
yield finalChunk;
114
}
115
}
116
117
private finalizeChunk(file: URI, chunkLines: readonly IChunkedLine[], shouldDedent: boolean, leadingWhitespace: string, isLastChunk: boolean): FileChunk | undefined {
118
if (!chunkLines.length) {
119
return undefined;
120
}
121
122
const finalizedChunkText = shouldDedent
123
? chunkLines.map(x => x.text.substring(leadingWhitespace.length)).join('\n')
124
: chunkLines.map(x => x.text).join('\n');
125
126
const lastLine = chunkLines[chunkLines.length - 1];
127
return {
128
file: file,
129
// For naive chunking, the raw text is the same as the processed text
130
text: finalizedChunkText,
131
rawText: finalizedChunkText,
132
isFullFile: isLastChunk && chunkLines[0].lineNumber === 0,
133
range: new Range(
134
chunkLines[0].lineNumber,
135
0,
136
lastLine.lineNumber,
137
lastLine.text.length,
138
),
139
};
140
}
141
}
142
143
export function trimCommonLeadingWhitespace(lines: string[]): { trimmedLines: string[]; shortestLeadingCommonWhitespace: string } {
144
let longestCommonWhitespace: string | undefined;
145
for (const line of lines) {
146
const leadingWhitespaceMatches = line.match(/^\s+/);
147
const currentLeadingWhitespace = leadingWhitespaceMatches ? leadingWhitespaceMatches[0] : '';
148
149
if (longestCommonWhitespace === undefined) {
150
longestCommonWhitespace = currentLeadingWhitespace;
151
} else {
152
longestCommonWhitespace = commonLeadingStr(longestCommonWhitespace, currentLeadingWhitespace);
153
}
154
155
if (!longestCommonWhitespace || longestCommonWhitespace.length === 0) {
156
// No common leading whitespace, no need to continue
157
return {
158
trimmedLines: lines,
159
shortestLeadingCommonWhitespace: '',
160
};
161
}
162
}
163
164
const dedentLength = (longestCommonWhitespace ?? '').length;
165
return {
166
trimmedLines: lines.map(e => e.substring(dedentLength)),
167
shortestLeadingCommonWhitespace: longestCommonWhitespace ?? '',
168
};
169
}
170
171
/**
 * Returns the longest prefix shared by both strings (callers in this file pass
 * leading-whitespace strings, but the helper is general).
 */
function commonLeadingStr(str1: string, str2: string) {
	// Walk both strings in lockstep until they disagree (inlines the
	// common-prefix-length utility so this helper is self-contained).
	const limit = Math.min(str1.length, str2.length);
	let i = 0;
	while (i < limit && str1.charCodeAt(i) === str2.charCodeAt(i)) {
		i++;
	}
	return str1.substring(0, i);
}
175
176