Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunkerService.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { createServiceIdentifier } from '../../../util/common/services';
7
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
8
import { Uri } from '../../../vscodeTypes';
9
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
10
import { FileChunk } from '../common/chunk';
11
import { MAX_CHUNK_SIZE_TOKENS, NaiveChunker } from './naiveChunker';
12
13
interface NaiveChunkingOptions {
	/**
	 * The desired maximum length of each chunk in tokens.
	 * Falls back to `MAX_CHUNK_SIZE_TOKENS` when omitted (see `NaiveChunkingService.chunkFile`).
	 */
	readonly maxTokenLength?: number;

	/**
	 * When true, each produced chunk is re-tokenized after chunking and a warning is
	 * logged for any chunk exceeding the token limit (see `validateChunkLengths`).
	 */
	readonly validateChunkLengths?: boolean;

	readonly includeExtraBodyOutsideRange?: boolean; // only gets applied if limitToRange is set
	// NOTE(review): `limitToRange` is not declared on this interface — presumably it
	// lives on a related options type or is handled by the chunker; verify against callers.
}
21
22
export interface INaiveChunkingService {

	/**
	 * Splits `text` into smaller chunks of roughly equal length using a scrolling window approach.
	 *
	 * @param endpoint Identifies the tokenizer used to measure chunk lengths.
	 * @param fileUri Uri of the file the text belongs to; carried through to the produced chunks.
	 * @param text The full file contents to split.
	 * @param options Controls the maximum chunk size and optional post-chunk validation.
	 * @param token Cancellation token for aborting the (async) chunking work.
	 * @returns The chunks of the file; empty-text chunks are filtered out.
	 */
	chunkFile(endpoint: TokenizationEndpoint, fileUri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]>;
}
29
30
export const INaiveChunkingService = createServiceIdentifier<INaiveChunkingService>('INaiveChunkingService');
31
32
export class NaiveChunkingService implements INaiveChunkingService {
33
34
declare _serviceBrand: undefined;
35
36
private readonly naiveChunkers = new Map</*endpoint */ string, NaiveChunker>();
37
38
constructor(
39
@ITokenizerProvider private readonly tokenizerProvider: ITokenizerProvider,
40
) { }
41
42
async chunkFile(endpoint: TokenizationEndpoint, uri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]> {
43
const maxTokenLength = options?.maxTokenLength ?? MAX_CHUNK_SIZE_TOKENS;
44
45
const out = await this.getNaiveChunker(endpoint).chunkFile(uri, text, { maxTokenLength }, token);
46
if (options?.validateChunkLengths) {
47
await this.validateChunkLengths(out, maxTokenLength, endpoint);
48
}
49
50
return out.filter(x => x.text);
51
}
52
53
private getNaiveChunker(endpoint: TokenizationEndpoint): NaiveChunker {
54
const cached = this.naiveChunkers.get(endpoint.tokenizer);
55
if (cached) {
56
return cached;
57
}
58
59
const chunker = new NaiveChunker(endpoint, this.tokenizerProvider);
60
this.naiveChunkers.set(endpoint.tokenizer, chunker);
61
return chunker;
62
}
63
64
private async validateChunkLengths(chunks: FileChunk[], maxTokenLength: number, endpoint: TokenizationEndpoint) {
65
for (const chunk of chunks) {
66
const tokenLength = await this.tokenizerProvider.acquireTokenizer(endpoint).tokenLength(chunk.text);
67
if (tokenLength > maxTokenLength * 1.2) {
68
console.warn('Produced chunk that is over length limit', { file: chunk.file + '', range: chunk.range, chunkTokenLength: tokenLength, maxLength: maxTokenLength });
69
}
70
}
71
}
72
}
73
74
75