CoCalc -- naiveChunkerService.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunkerService.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import { createServiceIdentifier } from '../../../util/common/services';
7
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
8
import { Uri } from '../../../vscodeTypes';
9
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
10
import { FileChunk } from '../common/chunk';
11
import { MAX_CHUNK_SIZE_TOKENS, NaiveChunker } from './naiveChunker';
12

13
/**
 * Options accepted by {@link INaiveChunkingService.chunkFile}.
 *
 * Every field is optional; an empty object selects the defaults.
 */
interface NaiveChunkingOptions {
	/**
	 * The desired maximum length of each chunk in tokens.
	 *
	 * When omitted, the service falls back to `MAX_CHUNK_SIZE_TOKENS`.
	 */
	readonly maxTokenLength?: number;

	/**
	 * When truthy, the produced chunks are re-tokenized after chunking and a
	 * warning is logged for any chunk whose token length exceeds
	 * `maxTokenLength * 1.2`. Over-long chunks are only reported; they are
	 * neither dropped nor re-split.
	 */
	readonly validateChunkLengths?: boolean;

	/**
	 * Only gets applied if `limitToRange` is set.
	 *
	 * NOTE(review): `limitToRange` is not declared on this interface, and the
	 * `NaiveChunkingService.chunkFile` implementation in this file never reads
	 * this flag — confirm whether it is still consumed anywhere.
	 */
	readonly includeExtraBodyOutsideRange?: boolean;
}
21

22
/**
 * Service that splits a file's text into token-bounded chunks.
 */
export interface INaiveChunkingService {

	/**
	 * Splits `text` into smaller chunks of roughly equal length using a scrolling window approach.
	 *
	 * @param endpoint Tokenization endpoint whose tokenizer is used to measure chunk lengths.
	 * @param fileUri Uri of the file that `text` belongs to; forwarded to the underlying chunker.
	 * @param text The full text to split.
	 * @param options Chunking options (see `NaiveChunkingOptions`).
	 * @param token Cancellation token forwarded to the underlying chunker.
	 * @returns The produced chunks. The implementation in this file omits chunks whose text is empty.
	 */
	chunkFile(endpoint: TokenizationEndpoint, fileUri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]>;
}
29

30
/**
 * Service identifier for {@link INaiveChunkingService}, registered under the id
 * `'INaiveChunkingService'`.
 *
 * It deliberately shares its name with the interface, so the one symbol is
 * usable both as a type and as a value.
 * NOTE(review): presumably consumed as a constructor-parameter decorator for
 * dependency injection, the way `@ITokenizerProvider` is used by
 * `NaiveChunkingService` — confirm against `createServiceIdentifier`.
 */
export const INaiveChunkingService = createServiceIdentifier<INaiveChunkingService>('INaiveChunkingService');
31

32
/**
 * {@link INaiveChunkingService} implementation backed by {@link NaiveChunker}.
 *
 * One `NaiveChunker` is created lazily per distinct `endpoint.tokenizer` value
 * and reused for every later call that uses the same tokenizer.
 */
export class NaiveChunkingService implements INaiveChunkingService {

	declare _serviceBrand: undefined;

	/** Chunkers created so far, keyed by `endpoint.tokenizer`. */
	private readonly naiveChunkers = new Map</* endpoint.tokenizer */ string, NaiveChunker>();

	constructor(
		@ITokenizerProvider private readonly tokenizerProvider: ITokenizerProvider,
	) { }

	/**
	 * Splits `text` into chunks of at most (roughly) `options.maxTokenLength` tokens.
	 *
	 * @param endpoint Tokenization endpoint; its `tokenizer` selects the cached chunker.
	 * @param uri Uri of the file being chunked; forwarded to the chunker.
	 * @param text The full text to split.
	 * @param options Chunking options. `maxTokenLength` defaults to `MAX_CHUNK_SIZE_TOKENS`.
	 * @param token Cancellation token forwarded to the chunker.
	 * @returns The chunks produced by the chunker, without those whose text is empty.
	 */
	async chunkFile(endpoint: TokenizationEndpoint, uri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]> {
		const maxTokenLength = options?.maxTokenLength ?? MAX_CHUNK_SIZE_TOKENS;

		const produced = await this.getNaiveChunker(endpoint).chunkFile(uri, text, { maxTokenLength }, token);

		// Discard empty chunks before validating so we never tokenize text that
		// is thrown away anyway.
		const nonEmpty = produced.filter(chunk => chunk.text);
		if (options?.validateChunkLengths) {
			await this.validateChunkLengths(nonEmpty, maxTokenLength, endpoint);
		}

		return nonEmpty;
	}

	/**
	 * Returns the chunker cached for `endpoint.tokenizer`, creating and caching
	 * it on first use.
	 *
	 * Note that a cached chunker keeps the endpoint instance it was first
	 * constructed with, even if a later call passes a different endpoint object
	 * that has the same `tokenizer`.
	 */
	private getNaiveChunker(endpoint: TokenizationEndpoint): NaiveChunker {
		const cached = this.naiveChunkers.get(endpoint.tokenizer);
		if (cached) {
			return cached;
		}

		const chunker = new NaiveChunker(endpoint, this.tokenizerProvider);
		this.naiveChunkers.set(endpoint.tokenizer, chunker);
		return chunker;
	}

	/**
	 * Logs a warning for every chunk whose token length exceeds
	 * `maxTokenLength * 1.2`. Purely diagnostic: `chunks` is not modified and
	 * nothing is thrown for over-long chunks.
	 *
	 * Chunks are measured one at a time, so warnings appear in chunk order.
	 */
	private async validateChunkLengths(chunks: FileChunk[], maxTokenLength: number, endpoint: TokenizationEndpoint): Promise<void> {
		if (chunks.length === 0) {
			return;
		}

		// The tokenizer depends only on the endpoint, so acquire it once rather
		// than once per chunk.
		const tokenizer = this.tokenizerProvider.acquireTokenizer(endpoint);

		// Allow chunks to overshoot the requested size by 20% before reporting them.
		const warnThreshold = maxTokenLength * 1.2;

		for (const chunk of chunks) {
			const tokenLength = await tokenizer.tokenLength(chunk.text);
			if (tokenLength > warnThreshold) {
				console.warn('Produced chunk that is over length limit', { file: chunk.file + '', range: chunk.range, chunkTokenLength: tokenLength, maxLength: maxTokenLength });
			}
		}
	}
}
73

74

75
Product

Resources

Company