Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/endpoint/vscode-node/test/extChatTokenizer.spec.ts
13405 views
/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { Raw } from '@vscode/prompt-tsx';
import { beforeEach, describe, expect, it, vi } from 'vitest';
import type { LanguageModelChat, LanguageModelChatMessage, LanguageModelChatMessage2 } from 'vscode';
import { ExtensionContributedChatTokenizer } from '../extChatTokenizer';

/**
 * Mock implementation of LanguageModelChat for testing purposes.
 * Simulates token counting with a configurable strategy.
 */
class MockLanguageModelChat implements Partial<LanguageModelChat> {
	private readonly _tokenCountFn: (input: string | LanguageModelChatMessage | LanguageModelChatMessage2) => number;

	constructor(tokenCountFn?: (input: string | LanguageModelChatMessage | LanguageModelChatMessage2) => number) {
		// Word-count approximation: whitespace-delimited pieces, empties dropped.
		const countWords = (text: string): number => text.split(/\s+/).filter(Boolean).length;
		// Default strategy: strings are counted directly; messages sum the word
		// counts of every string-valued content part.
		this._tokenCountFn = tokenCountFn ?? ((input) => {
			if (typeof input === 'string') {
				return countWords(input);
			}
			let sum = 0;
			for (const part of input.content) {
				if ('value' in part && typeof part.value === 'string') {
					sum += countWords(part.value);
				}
			}
			return sum;
		});
	}

	countTokens(input: string | LanguageModelChatMessage | LanguageModelChatMessage2): Thenable<number> {
		return Promise.resolve(this._tokenCountFn(input));
	}
}
describe('ExtensionContributedChatTokenizer', () => {
41
let tokenizer: ExtensionContributedChatTokenizer;
42
let mockLanguageModel: MockLanguageModelChat;
43
44
beforeEach(() => {
45
mockLanguageModel = new MockLanguageModelChat();
46
tokenizer = new ExtensionContributedChatTokenizer(mockLanguageModel as unknown as LanguageModelChat);
47
});
48
49
describe('tokenLength', () => {
50
it('should count tokens for a simple string', async () => {
51
const result = await tokenizer.tokenLength('Hello world');
52
expect(result).toBe(2); // "Hello" and "world"
53
});
54
55
it('should return 0 for an empty string', async () => {
56
const result = await tokenizer.tokenLength('');
57
expect(result).toBe(0);
58
});
59
60
it('should count tokens for a text content part', async () => {
61
const textPart: Raw.ChatCompletionContentPart = {
62
type: Raw.ChatCompletionContentPartKind.Text,
63
text: 'This is a test message'
64
};
65
const result = await tokenizer.tokenLength(textPart);
66
expect(result).toBe(5); // 5 words
67
});
68
69
it('should return tokenUsage for opaque content parts', async () => {
70
const opaquePart: Raw.ChatCompletionContentPart = {
71
type: Raw.ChatCompletionContentPartKind.Opaque,
72
value: { some: 'data' },
73
tokenUsage: 42
74
};
75
const result = await tokenizer.tokenLength(opaquePart);
76
expect(result).toBe(42);
77
});
78
79
it('should return 0 for opaque content parts without tokenUsage', async () => {
80
const opaquePart: Raw.ChatCompletionContentPart = {
81
type: Raw.ChatCompletionContentPartKind.Opaque,
82
value: { some: 'data' }
83
};
84
const result = await tokenizer.tokenLength(opaquePart);
85
expect(result).toBe(0);
86
});
87
88
it('should return 0 for cache breakpoint content parts', async () => {
89
const cacheBreakpoint: Raw.ChatCompletionContentPart = {
90
type: Raw.ChatCompletionContentPartKind.CacheBreakpoint
91
};
92
const result = await tokenizer.tokenLength(cacheBreakpoint);
93
expect(result).toBe(0);
94
});
95
96
it('should count tokens for document content parts', async () => {
97
const documentPart: Raw.ChatCompletionContentPart = {
98
type: Raw.ChatCompletionContentPartKind.Document,
99
documentData: { data: 'JVBERi0xLjQK base64 encoded pdf data', mediaType: 'application/pdf' },
100
};
101
const result = await tokenizer.tokenLength(documentPart);
102
// Token length for documents is estimated from document size; it should be positive.
103
expect(result).toBeGreaterThan(0);
104
});
105
});
106
107
describe('countMessageTokens', () => {
108
it('should count tokens for a user message', async () => {
109
const message: Raw.ChatMessage = {
110
role: Raw.ChatRole.User,
111
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello there' }]
112
};
113
const result = await tokenizer.countMessageTokens(message);
114
// BaseTokensPerMessage (3) + message content tokens
115
expect(result).toBeGreaterThanOrEqual(3);
116
});
117
118
it('should count tokens for an assistant message', async () => {
119
const message: Raw.ChatMessage = {
120
role: Raw.ChatRole.Assistant,
121
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'I can help with that' }]
122
};
123
const result = await tokenizer.countMessageTokens(message);
124
expect(result).toBeGreaterThanOrEqual(3);
125
});
126
127
it('should count tokens for a system message', async () => {
128
const message: Raw.ChatMessage = {
129
role: Raw.ChatRole.System,
130
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'You are a helpful assistant' }]
131
};
132
const result = await tokenizer.countMessageTokens(message);
133
expect(result).toBeGreaterThanOrEqual(3);
134
});
135
});
136
137
describe('countMessagesTokens', () => {
138
it('should count tokens for multiple messages', async () => {
139
const messages: Raw.ChatMessage[] = [
140
{
141
role: Raw.ChatRole.System,
142
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'You are helpful' }]
143
},
144
{
145
role: Raw.ChatRole.User,
146
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hi' }]
147
},
148
{
149
role: Raw.ChatRole.Assistant,
150
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello' }]
151
}
152
];
153
const result = await tokenizer.countMessagesTokens(messages);
154
// BaseTokensPerCompletion (3) + 3 messages * BaseTokensPerMessage (3) + content tokens
155
expect(result).toBeGreaterThanOrEqual(12);
156
});
157
158
it('should return base tokens for empty messages array', async () => {
159
const result = await tokenizer.countMessagesTokens([]);
160
expect(result).toBe(3); // BaseTokensPerCompletion
161
});
162
});
163
164
describe('countToolTokens', () => {
165
it('should count tokens for a single tool', async () => {
166
const tools = [{
167
name: 'get_weather',
168
description: 'Get the current weather',
169
inputSchema: {
170
type: 'object',
171
properties: {
172
location: { type: 'string' }
173
}
174
}
175
}];
176
const result = await tokenizer.countToolTokens(tools);
177
// baseToolTokens (16) + baseTokensPerTool (8) + object tokens * 1.1
178
expect(result).toBeGreaterThan(24);
179
});
180
181
it('should count tokens for multiple tools', async () => {
182
const tools = [
183
{
184
name: 'get_weather',
185
description: 'Get weather info',
186
inputSchema: { type: 'object' }
187
},
188
{
189
name: 'search',
190
description: 'Search the web',
191
inputSchema: { type: 'object' }
192
}
193
];
194
const result = await tokenizer.countToolTokens(tools);
195
// baseToolTokens (16) + 2 * baseTokensPerTool (8) + object tokens
196
expect(result).toBeGreaterThan(32);
197
});
198
199
it('should return 0 for empty tools array', async () => {
200
const result = await tokenizer.countToolTokens([]);
201
expect(result).toBe(0);
202
});
203
});
204
205
describe('with custom token counting', () => {
206
it('should use the language model countTokens method', async () => {
207
const countTokensSpy = vi.fn().mockResolvedValue(10);
208
const customMock = {
209
countTokens: countTokensSpy
210
} as unknown as LanguageModelChat;
211
212
const customTokenizer = new ExtensionContributedChatTokenizer(customMock);
213
const result = await customTokenizer.tokenLength('test string');
214
215
expect(countTokensSpy).toHaveBeenCalledWith('test string');
216
expect(result).toBe(10);
217
});
218
219
it('should delegate message token counting to language model', async () => {
220
const countTokensSpy = vi.fn().mockResolvedValue(15);
221
const customMock = {
222
countTokens: countTokensSpy
223
} as unknown as LanguageModelChat;
224
225
const customTokenizer = new ExtensionContributedChatTokenizer(customMock);
226
const message: Raw.ChatMessage = {
227
role: Raw.ChatRole.User,
228
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello' }]
229
};
230
231
const result = await customTokenizer.countMessageTokens(message);
232
// BaseTokensPerMessage (3) + 15 from language model
233
expect(result).toBe(18);
234
expect(countTokensSpy).toHaveBeenCalled();
235
});
236
});
237
});
238
239