CoCalc -- naiveChunker.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunker.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
5

6
import { ITokenizer } from '../../../util/common/tokenizer';
7
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
8
import { commonPrefixLength, isFalsyOrWhitespace, splitLines } from '../../../util/vs/base/common/strings';
9
import { URI } from '../../../util/vs/base/common/uri';
10
import { Range } from '../../../util/vs/editor/common/core/range';
11
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
12
import { FileChunk } from '../common/chunk';
13

14
/**
 * Default token budget for a single chunk. Used as the default value of the
 * `maxTokenLength` option of `NaiveChunker.chunkFile`.
 */
export const MAX_CHUNK_SIZE_TOKENS = 250;
15

16
/**
 * One source line that has been accepted into the chunk currently being built.
 */
interface IChunkedLine {
	/** Line content as stored in the chunk (already length-capped and right-trimmed by the chunker). */
	readonly text: string;

	/** Zero-based index of the line within the array of lines the file text was split into. */
	readonly lineNumber: number;
}
20

21
/**
 * Splits a file's text into consecutive, line-aligned chunks whose token count
 * (measured with the tokenizer acquired for the given endpoint) stays within a
 * caller-supplied budget.
 */
export class NaiveChunker {
	private readonly tokenizer: ITokenizer;

	constructor(
		endpoint: TokenizationEndpoint,
		@ITokenizerProvider tokenizerProvider: ITokenizerProvider
	) {
		this.tokenizer = tokenizerProvider.acquireTokenizer(endpoint);
	}

	/**
	 * Chunks `text` and collects the resulting chunks into an array.
	 *
	 * Chunks are always dedented by the leading whitespace common to their lines.
	 *
	 * @param uri Resource the text belongs to; stored on every returned chunk.
	 * @param text Full text to split.
	 * @param options.maxTokenLength Token budget per chunk. Defaults to `MAX_CHUNK_SIZE_TOKENS`.
	 * @param options.removeEmptyLines When true (the default), blank/whitespace-only lines are skipped
	 * and chunks whose text contains no run of two consecutive word characters are dropped.
	 * @param token Cancellation token. If cancellation is observed while collecting, an empty array is returned.
	 * @returns The chunks in file order.
	 */
	async chunkFile(uri: URI, text: string, {
		maxTokenLength = MAX_CHUNK_SIZE_TOKENS,
		removeEmptyLines = true,
	}: {
		maxTokenLength?: number;
		removeEmptyLines?: boolean;
	}, token: CancellationToken): Promise<FileChunk[]> {
		const chunks: FileChunk[] = [];
		for await (const chunk of this._processLinesIntoChunks(
			uri, text,
			maxTokenLength,
			true,
			removeEmptyLines,
			token
		)) {
			if (token.isCancellationRequested) {
				return [];
			}

			// When filtering, keep only chunks that contain at least two adjacent word characters.
			if (!removeEmptyLines || (!!chunk.text.length && /[\w\d]{2}/.test(chunk.text))) {
				chunks.push(chunk);
			}
		}
		return chunks;
	}

	/**
	 * Walks the lines of `text` in order and yields a chunk each time adding the next
	 * line would push the accumulated token count above `maxTokenLength`, followed by
	 * one last chunk for whatever lines remain.
	 *
	 * Stops yielding (without emitting the pending chunk) once cancellation is observed
	 * after a tokenizer call.
	 */
	private async *_processLinesIntoChunks(
		uri: URI,
		text: string,
		maxTokenLength: number,
		shouldDedent: boolean,
		removeEmptyLines: boolean,
		token: CancellationToken,
	): AsyncIterable<FileChunk> {
		const originalLines = splitLines(text);

		const accumulatingChunk: IChunkedLine[] = [];
		let usedTokensInChunk = 0;
		// Leading whitespace shared by every line currently in `accumulatingChunk`;
		// `undefined` while the chunk is still empty.
		let longestCommonWhitespaceInChunk: string | undefined;

		for (let i = 0; i < originalLines.length; ++i) {
			const line = originalLines[i];
			if (removeEmptyLines && isFalsyOrWhitespace(line)) {
				continue;
			}

			// Cap very long lines by character count before tokenizing.
			// NOTE(review): the factor 4 looks like a characters-per-token heuristic — confirm.
			const lineText = line.slice(0, maxTokenLength * 4).trimEnd();
			const lineTokenCount = await this.tokenizer.tokenLength(lineText);
			if (token.isCancellationRequested) {
				return;
			}

			if (usedTokensInChunk + lineTokenCount > maxTokenLength) {
				// Emit previous chunk and reset state. This must happen before the current
				// line's indentation is folded into the shared prefix: the prefix has to
				// describe exactly the lines inside the chunk it is used to dedent.
				const chunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', false);
				if (chunk) {
					yield chunk;
				}

				accumulatingChunk.length = 0;
				usedTokensInChunk = 0;
				longestCommonWhitespaceInChunk = undefined;
			}

			// Fold this line's indentation into the prefix of the chunk it is about to join.
			// Doing so for the first line of a new chunk as well guarantees that dedenting
			// never strips non-whitespace characters from a less-indented first line.
			// Once the shared prefix is empty it cannot grow again, so the work is skipped.
			if (longestCommonWhitespaceInChunk === undefined || longestCommonWhitespaceInChunk.length > 0) {
				const leadingWhitespaceMatches = line.match(/^\s+/);
				const currentLeadingWhitespace = leadingWhitespaceMatches ? leadingWhitespaceMatches[0] : '';

				longestCommonWhitespaceInChunk = longestCommonWhitespaceInChunk === undefined
					? currentLeadingWhitespace
					: commonLeadingStr(longestCommonWhitespaceInChunk, currentLeadingWhitespace);
			}

			accumulatingChunk.push({
				text: lineText,
				lineNumber: i,
			});
			usedTokensInChunk += lineTokenCount;
		}

		const finalChunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', true);
		if (finalChunk) {
			yield finalChunk;
		}
	}

	/**
	 * Builds a `FileChunk` from the accumulated lines.
	 *
	 * @param file Resource stored on the chunk.
	 * @param chunkLines Lines that make up the chunk, in order.
	 * @param shouldDedent When true, `leadingWhitespace.length` characters are removed from the start of every line.
	 * @param leadingWhitespace Whitespace prefix shared by the lines of this chunk.
	 * @param isLastChunk Whether this is the final chunk produced for the text.
	 * @returns The chunk, or `undefined` when `chunkLines` is empty.
	 */
	private finalizeChunk(file: URI, chunkLines: readonly IChunkedLine[], shouldDedent: boolean, leadingWhitespace: string, isLastChunk: boolean): FileChunk | undefined {
		if (!chunkLines.length) {
			return undefined;
		}

		const finalizedChunkText = shouldDedent
			? chunkLines.map(x => x.text.substring(leadingWhitespace.length)).join('\n')
			: chunkLines.map(x => x.text).join('\n');

		const lastLine = chunkLines[chunkLines.length - 1];
		return {
			file: file,
			// For naive chunking, the raw text is the same as the processed text
			text: finalizedChunkText,
			rawText: finalizedChunkText,
			// Only a final chunk that also starts at the first line index can cover the whole file.
			isFullFile: isLastChunk && chunkLines[0].lineNumber === 0,
			// Spans from the start of the first line to the end of the last line's stored
			// (capped and right-trimmed, but not dedented) text.
			range: new Range(
				chunkLines[0].lineNumber,
				0,
				lastLine.lineNumber,
				lastLine.text.length,
			),
		};
	}
}
142

143
/**
 * Strips the leading whitespace that all of the given lines have in common.
 *
 * @param lines Lines to dedent. The array itself is not modified.
 * @returns `trimmedLines` with the shared prefix removed from each line, and the
 * removed prefix as `shortestLeadingCommonWhitespace`. When the lines share no
 * leading whitespace, the input array is returned as-is together with `''`.
 */
export function trimCommonLeadingWhitespace(lines: string[]): { trimmedLines: string[]; shortestLeadingCommonWhitespace: string } {
	let sharedIndent: string | undefined;

	for (const current of lines) {
		const indent = /^\s+/.exec(current)?.[0] ?? '';
		sharedIndent = sharedIndent === undefined
			? indent
			: commonLeadingStr(sharedIndent, indent);

		// An empty shared prefix can never grow back, so stop scanning early.
		if (sharedIndent.length === 0) {
			return { trimmedLines: lines, shortestLeadingCommonWhitespace: '' };
		}
	}

	// `sharedIndent` is still undefined only when `lines` is empty.
	const prefix = sharedIndent ?? '';
	const trimmedLines = lines.map(entry => entry.substring(prefix.length));
	return { trimmedLines, shortestLeadingCommonWhitespace: prefix };
}
170

171
/**
 * Returns the leading part of `str1` whose length is the common prefix length
 * of `str1` and `str2`, as computed by `commonPrefixLength`.
 */
function commonLeadingStr(str1: string, str2: string) {
	return str1.substring(0, commonPrefixLength(str1, str2));
}
175

176
Product

Resources

Company