Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/script/build/compressTikToken.ts
13389 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import * as assert from 'assert';
7
import { mkdir, readFile, writeFile } from 'fs/promises';
8
import * as path from 'path';
9
import { parseTikTokenBinary } from '../../src/platform/tokenizer/node/parseTikTokens';
10
import { writeVariableLengthQuantity } from '../../src/util/common/variableLengthQuantity';
11
12
/**
 * Compresses a `.tiktoken` file into a much more compact binary format.
 *
 * A tiktoken file is a list of base64 encoded terms, each followed by a space
 * and (rather unnecessarily) by its index, like
 * ```
 * IQ== 0
 * Ig== 1
 * Iw== 2
 * JA== 3
 * JQ== 4
 * Jg== 5
 * Jw== 6
 * KA== 7
 * ```
 *
 * The indices must count up from zero without gaps (this is verified while
 * parsing), so they are implied by a term's position and are not written to
 * the output. Each term is represented by a VLQ-encoded byte length followed
 * by the raw bytes of the term itself.
 *
 * NOTE(review): an earlier version of this comment said the format takes
 * advantage of term lengths increasing monotonically with their index; the
 * code below neither relies on nor checks that property.
 *
 * A fancier format with "runs" of certain lengths was explored, however the
 * difference was only a byte or two in exchange for much higher complexity.
 *
 * After writing, the output file is parsed back and compared against the
 * input terms as a sanity check.
 *
 * @param inputFile Path of the `.tiktoken` text file to read (UTF-8).
 * @param outputFile Path of the binary file to write. Missing parent
 * directories are created.
 * @throws Error if a line is not of the form `<base64> <index>` or if an
 * index does not match the line's position in the file.
 */
export async function compressTikToken(inputFile: string, outputFile: string): Promise<void> {
	const raw = await readFile(inputFile, 'utf-8');
	const terms: Buffer[] = [];

	// Split on LF or CRLF so files checked out with Windows line endings
	// do not leave a stray `\r` on the index or on otherwise blank lines.
	for (const line of raw.split(/\r?\n/)) {
		if (!line) {
			continue;
		}

		const [base64, iStr] = line.split(' ');
		// Validate the index textually: `Number('')` is 0 and `Number('0x0')`
		// is 0, so a numeric conversion alone would accept malformed lines.
		if (!base64 || iStr === undefined || !/^\d+$/.test(iStr)) {
			throw new Error(`malformed line ${line}`);
		}
		// The index is dropped from the output, so it must equal the position.
		if (Number(iStr) !== terms.length) {
			throw new Error('non-monotonic index');
		}

		terms.push(Buffer.from(base64, 'base64'));
	}

	// Layout: [VLQ(length), bytes] repeated once per term, in index order.
	const output: Uint8Array[] = [];
	for (const term of terms) {
		output.push(writeVariableLengthQuantity(term.length).buffer, term);
	}

	await mkdir(path.dirname(outputFile), { recursive: true });
	await writeFile(outputFile, Buffer.concat(output));

	// Round-trip check: the file just written must parse back to the same terms.
	assertOk(outputFile, terms);
}
66
67
/**
 * Sanity check for a freshly written compressed file: parses `outputFile`
 * with `parseTikTokenBinary` and asserts that the result matches `terms`.
 *
 * Each parsed entry is destructured as a `[term, index]` pair; the term bytes
 * are base64-encoded and stored at `index`, and the resulting array is
 * compared with the base64 form of `terms` in order.
 *
 * @param outputFile Path of the binary file to read back.
 * @param terms The terms that were written, in index order.
 * @throws AssertionError (from `assert.deepStrictEqual`) if the two differ.
 */
function assertOk(outputFile: string, terms: Buffer[]) {
	const roundTripped: string[] = [];

	for (const entry of parseTikTokenBinary(outputFile)) {
		const [bytes, position] = entry;
		roundTripped[position] = Buffer.from(bytes).toString('base64');
	}

	const expected = terms.map(original => original.toString('base64'));
	assert.deepStrictEqual(roundTripped, expected);
}
77
78