Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunkerService.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { createServiceIdentifier } from '../../../util/common/services';
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
import { Uri } from '../../../vscodeTypes';
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
import { FileChunk } from '../common/chunk';
import { MAX_CHUNK_SIZE_TOKENS, NaiveChunker } from './naiveChunker';

/**
 * Options accepted by {@link INaiveChunkingService.chunkFile}.
 */
interface NaiveChunkingOptions {
	/**
	 * The desired maximum length of each chunk in tokens.
	 * Falls back to `MAX_CHUNK_SIZE_TOKENS` when omitted.
	 */
	readonly maxTokenLength?: number;

	/**
	 * When set, every produced chunk is re-tokenized after chunking and a warning is logged
	 * for chunks that are noticeably longer than the requested maximum.
	 */
	readonly validateChunkLengths?: boolean;

	// NOTE(review): this option is not read anywhere in this file — confirm whether a consumer still uses it.
	readonly includeExtraBodyOutsideRange?: boolean; // only gets applied if limitToRange is set
}

export interface INaiveChunkingService {

	/**
	 * Splits `text` into smaller chunks of roughly equal length using a scrolling window approach.
	 */
	chunkFile(endpoint: TokenizationEndpoint, fileUri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]>;
}

export const INaiveChunkingService = createServiceIdentifier<INaiveChunkingService>('INaiveChunkingService');

/**
 * {@link INaiveChunkingService} implementation that delegates the actual splitting to a
 * {@link NaiveChunker}, keeping one chunker instance per tokenizer.
 */
export class NaiveChunkingService implements INaiveChunkingService {

	declare _serviceBrand: undefined;

	/** Chunkers created so far, keyed by `endpoint.tokenizer`. */
	private readonly naiveChunkers = new Map</*endpoint */ string, NaiveChunker>();

	constructor(
		@ITokenizerProvider private readonly tokenizerProvider: ITokenizerProvider,
	) { }

	/**
	 * Chunks `text` with the chunker associated with `endpoint`.
	 *
	 * @param endpoint Endpoint whose tokenizer selects the chunker and is used for validation.
	 * @param uri Uri forwarded to the chunker for the file being chunked.
	 * @param text Full text to split.
	 * @param options Chunking options; see {@link NaiveChunkingOptions}.
	 * @param token Cancellation token forwarded to the chunker.
	 * @returns The produced chunks, excluding any chunk whose `text` is empty.
	 */
	async chunkFile(endpoint: TokenizationEndpoint, uri: Uri, text: string, options: NaiveChunkingOptions, token: CancellationToken): Promise<FileChunk[]> {
		const maxTokenLength = options?.maxTokenLength ?? MAX_CHUNK_SIZE_TOKENS;

		const out = await this.getNaiveChunker(endpoint).chunkFile(uri, text, { maxTokenLength }, token);
		if (options?.validateChunkLengths) {
			// Validation runs on the unfiltered result, before empty chunks are dropped below.
			await this.validateChunkLengths(out, maxTokenLength, endpoint);
		}

		return out.filter(x => x.text);
	}

	/**
	 * Returns the chunker cached for `endpoint.tokenizer`, creating and caching it on first use.
	 */
	private getNaiveChunker(endpoint: TokenizationEndpoint): NaiveChunker {
		const cached = this.naiveChunkers.get(endpoint.tokenizer);
		if (cached) {
			return cached;
		}

		const chunker = new NaiveChunker(endpoint, this.tokenizerProvider);
		this.naiveChunkers.set(endpoint.tokenizer, chunker);
		return chunker;
	}

	/**
	 * Logs a warning for every chunk whose token length exceeds `maxTokenLength` by more than 20%.
	 * Purely diagnostic: chunks are neither modified nor removed.
	 */
	private async validateChunkLengths(chunks: FileChunk[], maxTokenLength: number, endpoint: TokenizationEndpoint) {
		if (chunks.length === 0) {
			// Nothing to measure; avoid acquiring a tokenizer at all.
			return;
		}

		// The endpoint is the same for every chunk, so acquire the tokenizer once instead of per iteration.
		const tokenizer = this.tokenizerProvider.acquireTokenizer(endpoint);

		// Chunks are measured one after another so warnings are emitted in chunk order.
		for (const chunk of chunks) {
			const tokenLength = await tokenizer.tokenLength(chunk.text);
			if (tokenLength > maxTokenLength * 1.2) {
				console.warn('Produced chunk that is over length limit', { file: chunk.file + '', range: chunk.range, chunkTokenLength: tokenLength, maxLength: maxTokenLength });
			}
		}
	}
}