Path: blob/main/extensions/copilot/src/platform/chunking/node/naiveChunker.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { ITokenizer } from '../../../util/common/tokenizer';
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
import { commonPrefixLength, isFalsyOrWhitespace, splitLines } from '../../../util/vs/base/common/strings';
import { URI } from '../../../util/vs/base/common/uri';
import { Range } from '../../../util/vs/editor/common/core/range';
import { ITokenizerProvider, TokenizationEndpoint } from '../../tokenizer/node/tokenizer';
import { FileChunk } from '../common/chunk';

/** Default upper bound, in tokens, for a single chunk produced by {@link NaiveChunker}. */
export const MAX_CHUNK_SIZE_TOKENS = 250;

/** A line that has been accepted into the chunk currently being accumulated. */
interface IChunkedLine {
	/** Line text after truncation and trailing-whitespace trimming (not yet dedented). */
	readonly text: string;
	/** Index of the line in the array returned by `splitLines` for the input text. */
	readonly lineNumber: number;
}

/**
 * Splits a file into consecutive, line-aligned chunks whose token count (as measured by the
 * tokenizer acquired for the given endpoint) stays within a configurable budget.
 */
export class NaiveChunker {
	private readonly tokenizer: ITokenizer;

	constructor(
		endpoint: TokenizationEndpoint,
		@ITokenizerProvider tokenizerProvider: ITokenizerProvider
	) {
		this.tokenizer = tokenizerProvider.acquireTokenizer(endpoint);
	}

	/**
	 * Chunks `text` into a list of {@link FileChunk}s. Chunks are always dedented.
	 *
	 * @param uri Resource the text belongs to; stored on every chunk.
	 * @param text Full text to chunk.
	 * @param options `maxTokenLength` is the per-chunk token budget (defaults to
	 * {@link MAX_CHUNK_SIZE_TOKENS}). `removeEmptyLines` (defaults to `true`) skips blank lines
	 * and additionally drops chunks that do not contain two consecutive word characters.
	 * @param token Cancellation token.
	 * @returns The chunks in file order, or an empty array if cancellation was requested.
	 */
	async chunkFile(uri: URI, text: string, {
		maxTokenLength = MAX_CHUNK_SIZE_TOKENS,
		removeEmptyLines = true,
	}: {
		maxTokenLength?: number;
		removeEmptyLines?: boolean;
	}, token: CancellationToken): Promise<FileChunk[]> {
		const chunks: FileChunk[] = [];
		for await (const chunk of this._processLinesIntoChunks(
			uri, text,
			maxTokenLength,
			true,
			removeEmptyLines,
			token
		)) {
			if (token.isCancellationRequested) {
				return [];
			}

			if (!removeEmptyLines || (!!chunk.text.length && /[\w\d]{2}/.test(chunk.text))) {
				chunks.push(chunk);
			}
		}

		// The generator stops silently when it observes cancellation itself. Re-check here so
		// that a cancelled run never hands back a partial chunk list.
		if (token.isCancellationRequested) {
			return [];
		}

		return chunks;
	}

	/**
	 * Walks the lines of `text`, accumulating them until adding the next line would exceed
	 * `maxTokenLength`, and yields one chunk per accumulated group.
	 *
	 * Stops without yielding the pending chunk as soon as cancellation is observed.
	 */
	private async *_processLinesIntoChunks(
		uri: URI,
		text: string,
		maxTokenLength: number,
		shouldDedent: boolean,
		removeEmptyLines: boolean,
		token: CancellationToken,
	): AsyncIterable<FileChunk> {
		const originalLines = splitLines(text);

		const accumulatingChunk: IChunkedLine[] = [];
		let usedTokensInChunk = 0;
		// `undefined` means "no line seen yet for this chunk"; '' means "nothing to dedent".
		let longestCommonWhitespaceInChunk: string | undefined;

		for (let i = 0; i < originalLines.length; ++i) {
			const line = originalLines[i];
			if (removeEmptyLines && isFalsyOrWhitespace(line)) {
				continue;
			}

			// Cap the characters handed to the tokenizer at 4x the token budget
			// (presumably a rough chars-per-token estimate) and drop trailing whitespace.
			const lineText = line.slice(0, maxTokenLength * 4).trimEnd();
			const lineTokenCount = await this.tokenizer.tokenLength(lineText);
			if (token.isCancellationRequested) {
				return;
			}

			if (usedTokensInChunk + lineTokenCount > maxTokenLength) {
				// This line does not fit: emit what has been accumulated so far and start over.
				// This must happen BEFORE the line's indentation is folded into the common
				// prefix, otherwise the line would influence the chunk it is not part of and
				// be ignored by the chunk it actually opens.
				const chunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', false);
				if (chunk) {
					yield chunk;
				}

				accumulatingChunk.length = 0;
				usedTokensInChunk = 0;
				longestCommonWhitespaceInChunk = undefined;
			}

			// Once the common prefix is empty it can never grow again, so stop computing it.
			if (longestCommonWhitespaceInChunk === undefined || longestCommonWhitespaceInChunk.length > 0) {
				const currentLeadingWhitespace = getLeadingWhitespace(line);
				longestCommonWhitespaceInChunk = longestCommonWhitespaceInChunk === undefined
					? currentLeadingWhitespace
					: commonLeadingStr(longestCommonWhitespaceInChunk, currentLeadingWhitespace);
			}

			accumulatingChunk.push({
				text: lineText,
				lineNumber: i,
			});
			usedTokensInChunk += lineTokenCount;
		}

		const finalChunk = this.finalizeChunk(uri, accumulatingChunk, shouldDedent, longestCommonWhitespaceInChunk ?? '', true);
		if (finalChunk) {
			yield finalChunk;
		}
	}

	/**
	 * Builds a {@link FileChunk} from the accumulated lines.
	 *
	 * @param file Resource stored on the chunk.
	 * @param chunkLines Lines making up the chunk, in order.
	 * @param shouldDedent When true, `leadingWhitespace.length` characters are removed from the
	 * start of every line.
	 * @param leadingWhitespace Whitespace prefix shared by all lines of the chunk.
	 * @param isLastChunk Whether this is the final chunk of the file.
	 * @returns The chunk, or `undefined` when `chunkLines` is empty.
	 */
	private finalizeChunk(file: URI, chunkLines: readonly IChunkedLine[], shouldDedent: boolean, leadingWhitespace: string, isLastChunk: boolean): FileChunk | undefined {
		if (!chunkLines.length) {
			return undefined;
		}

		const finalizedChunkText = shouldDedent
			? chunkLines.map(x => x.text.substring(leadingWhitespace.length)).join('\n')
			: chunkLines.map(x => x.text).join('\n');

		const lastLine = chunkLines[chunkLines.length - 1];
		return {
			file: file,
			// For naive chunking, the raw text is the same as the processed text
			text: finalizedChunkText,
			rawText: finalizedChunkText,
			// Only a final chunk that also starts at the very first line covers the whole file.
			isFullFile: isLastChunk && chunkLines[0].lineNumber === 0,
			// NOTE(review): line numbers are indices into the split lines and the end column is
			// the length of the un-dedented line text — confirm this matches what consumers of
			// `FileChunk.range` expect.
			range: new Range(
				chunkLines[0].lineNumber,
				0,
				lastLine.lineNumber,
				lastLine.text.length,
			),
		};
	}
}

/**
 * Removes the whitespace prefix shared by every entry of `lines`.
 *
 * @param lines Lines to dedent.
 * @returns The dedented lines together with the prefix that was removed. When the lines share
 * no leading whitespace, the input array itself is returned as `trimmedLines` and the prefix
 * is the empty string.
 */
export function trimCommonLeadingWhitespace(lines: string[]): { trimmedLines: string[]; shortestLeadingCommonWhitespace: string } {
	let longestCommonWhitespace: string | undefined;
	for (const line of lines) {
		const currentLeadingWhitespace = getLeadingWhitespace(line);

		if (longestCommonWhitespace === undefined) {
			longestCommonWhitespace = currentLeadingWhitespace;
		} else {
			longestCommonWhitespace = commonLeadingStr(longestCommonWhitespace, currentLeadingWhitespace);
		}

		if (!longestCommonWhitespace || longestCommonWhitespace.length === 0) {
			// No common leading whitespace, no need to continue
			return {
				trimmedLines: lines,
				shortestLeadingCommonWhitespace: '',
			};
		}
	}

	const dedentLength = (longestCommonWhitespace ?? '').length;
	return {
		trimmedLines: lines.map(e => e.substring(dedentLength)),
		shortestLeadingCommonWhitespace: longestCommonWhitespace ?? '',
	};
}

/** Returns the run of whitespace at the start of `line`, or '' when there is none. */
function getLeadingWhitespace(line: string): string {
	const leadingWhitespaceMatches = line.match(/^\s+/);
	return leadingWhitespaceMatches ? leadingWhitespaceMatches[0] : '';
}

/** Returns the longest prefix shared by `str1` and `str2`. */
function commonLeadingStr(str1: string, str2: string) {
	const prefixLength = commonPrefixLength(str1, str2);
	return str1.substring(0, prefixLength);
}