Path: blob/main/extensions/copilot/script/build/compressTikToken.ts
13389 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import * as assert from 'assert';
import { mkdir, readFile, writeFile } from 'fs/promises';
import * as path from 'path';
import { parseTikTokenBinary } from '../../src/platform/tokenizer/node/parseTikTokens';
import { writeVariableLengthQuantity } from '../../src/util/common/variableLengthQuantity';

/**
 * Compresses a `.tiktoken` file into a much more compact binary format.
 *
 * A tiktoken file is a list of base64 encoded terms, followed by a space
 * and (rather unnecessarily) by their index, like
 * ```
 * IQ== 0
 * Ig== 1
 * Iw== 2
 * JA== 3
 * JQ== 4
 * Jg== 5
 * Jw== 6
 * KA== 7
 * ```
 *
 * This compression takes advantage of the fact that the indices are
 * sequential: every line's index must equal its position in the file, so the
 * index never needs to be stored. Each term is represented by a VLQ-encoded
 * length followed by the term itself.
 *
 * I explored doing a fancier format with "runs" of certain lengths, however
 * the difference was only a byte or two in exchange for much higher complexity.
 *
 * After writing, the output is read back and compared against the input terms.
 *
 * @param inputFile Path of the `.tiktoken` text file to read (as UTF-8).
 * @param outputFile Path of the binary file to write. Missing parent
 * directories are created.
 * @throws Error if a line lacks a term or an index, if the index is not a
 * number, or if the index does not equal the line's position in the file.
 * @throws AssertionError if the written file does not parse back to the
 * same terms.
 */
export async function compressTikToken(inputFile: string, outputFile: string): Promise<void> {
	const raw = await readFile(inputFile, 'utf-8');
	const terms: Buffer[] = [];
	for (const rawLine of raw.split('\n')) {
		// Trim so that CRLF line endings (and a trailing "\r"-only line) do not
		// produce spurious "malformed line" errors.
		const line = rawLine.trim();
		if (!line) {
			continue;
		}

		const [base64, iStr] = line.split(' ');
		// `Number('')` is 0, so an empty index field must be rejected explicitly
		// rather than relying on the NaN check alone.
		if (!base64 || !iStr) {
			throw new Error(`malformed line ${line}`);
		}

		const i = Number(iStr);
		if (Number.isNaN(i)) {
			throw new Error(`malformed line ${line}`);
		}
		// The output format stores no indices, so each index must match the
		// term's position exactly.
		if (i !== terms.length) {
			throw new Error('non-monotonic index');
		}

		terms.push(Buffer.from(base64, 'base64'));
	}

	// Each term is emitted as its VLQ-encoded byte length followed by its bytes.
	const output: Uint8Array[] = [];
	for (const term of terms) {
		output.push(writeVariableLengthQuantity(term.length).buffer);
		output.push(term);
	}

	await mkdir(path.dirname(outputFile), { recursive: true });
	await writeFile(outputFile, Buffer.concat(output));
	assertOk(outputFile, terms);
}

/**
 * Round-trip check: parses the freshly written file and asserts that it
 * yields exactly the given terms at the same indices.
 *
 * NOTE(review): `parseTikTokenBinary` is given the output path and iterated
 * for `[term, index]` pairs; presumably it reads the file synchronously —
 * confirm against its implementation.
 *
 * @param outputFile Path of the compressed file that was just written.
 * @param terms The decoded terms in index order, as read from the input.
 * @throws AssertionError if the parsed terms differ from `terms`.
 */
function assertOk(outputFile: string, terms: Buffer[]): void {
	const parsed = parseTikTokenBinary(outputFile);
	const actual: string[] = [];

	// Compare as base64 strings, placed by index, so that the deep-equality
	// check covers both the content and the position of each term.
	for (const [term, index] of parsed) {
		actual[index] = Buffer.from(term).toString('base64');
	}

	assert.deepStrictEqual(actual, terms.map(t => t.toString('base64')));
}