Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/endpoint/vscode-node/extChatTokenizer.ts
13401 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { OutputMode, Raw } from '@vscode/prompt-tsx';
7
import { LanguageModelChat, LanguageModelChatTool } from 'vscode';
8
import { ITokenizer } from '../../../util/common/tokenizer';
9
import { assertNever } from '../../../util/vs/base/common/assert';
10
import { calculateImageTokenCost, estimateDocumentTokenCost } from '../../tokenizer/node/tokenizer';
11
import { convertToApiChatMessage } from './extChatEndpoint';
12
13
/**
 * Minimum number of tokens attributed to a completion request.
 * Every reply is primed with <|im_start|>assistant<|message|>; these tokens account for
 * the special token and the role name.
 */
const BaseTokensPerCompletion = 3;
19
20
/**
 * Fixed per-message overhead: each GPT 3.5 / GPT 4 message carries 3 additional tokens
 * because of special characters.
 */
const BaseTokensPerMessage = 3;
24
25
26
/**
 * `ITokenizer` implementation backed by a VS Code `LanguageModelChat`.
 *
 * Text and whole-message counts are delegated to the wrapped model's `countTokens`;
 * image, document and tool costs are estimated locally.
 */
export class ExtensionContributedChatTokenizer implements ITokenizer {
	public readonly mode = OutputMode.Raw;

	constructor(private readonly languageModel: LanguageModelChat) { }

	/**
	 * Returns the token length of a plain string or of a single chat content part.
	 *
	 * @param text Either raw text or one `Raw.ChatCompletionContentPart`.
	 * @returns The counted or estimated number of tokens for the input.
	 */
	async tokenLength(text: string | Raw.ChatCompletionContentPart): Promise<number> {
		if (typeof text === 'string') {
			return this._textTokenLength(text);
		}

		switch (text.type) {
			case Raw.ChatCompletionContentPartKind.Text:
				return this._textTokenLength(text.text);

			case Raw.ChatCompletionContentPartKind.Opaque:
				// Opaque parts report their own usage; a missing/zero value counts as nothing.
				return text.tokenUsage || 0;

			case Raw.ChatCompletionContentPartKind.Image: {
				const { url, detail } = text.imageUrl;
				if (!url.startsWith('data:image/')) {
					// Not an inline data URI: fall back to counting the URL string itself.
					return this._textTokenLength(url);
				}
				try {
					return calculateImageTokenCost(url, detail);
				} catch {
					// Image cost could not be computed: count the URL as plain text instead.
					return this._textTokenLength(url);
				}
			}

			case Raw.ChatCompletionContentPartKind.CacheBreakpoint:
				return 0;

			case Raw.ChatCompletionContentPartKind.Document:
				return estimateDocumentTokenCost(text.documentData.data);

			default:
				// Exhaustiveness guard for content part kinds not handled above.
				assertNever(text, `unknown content part (${JSON.stringify(text)})`);
		}
	}

	/**
	 * Counts the tokens of a text snippet through the VS Code language model API.
	 * Empty text short-circuits to 0 without calling the model.
	 */
	private async _textTokenLength(text: string): Promise<number> {
		return text ? this.languageModel.countTokens(text) : 0;
	}

	/**
	 * Counts the tokens of one chat message.
	 *
	 * The message is converted to the VS Code message format and the first converted
	 * message is handed to the model's `countTokens`; the fixed per-message overhead
	 * is then added. If the conversion yields no message, the result is 0.
	 */
	async countMessageTokens(message: Raw.ChatMessage): Promise<number> {
		const converted = convertToApiChatMessage([message]);
		if (converted.length === 0) {
			return 0;
		}

		// Only the first converted entry is counted.
		const contentTokens = await this.languageModel.countTokens(converted[0]);
		return BaseTokensPerMessage + contentTokens;
	}

	/**
	 * Counts the tokens of a list of chat messages: the base completion overhead
	 * plus the per-message count of every entry, evaluated one after another.
	 */
	async countMessagesTokens(messages: Raw.ChatMessage[]): Promise<number> {
		let total = BaseTokensPerCompletion;
		for (const message of messages) {
			const perMessage = await this.countMessageTokens(message);
			total += perMessage;
		}
		return total;
	}

	/**
	 * Estimates the tokens consumed by a set of tool definitions.
	 *
	 * A fixed overhead is charged once when at least one tool is present, plus a fixed
	 * overhead per tool and the token count of each tool's name, description and
	 * input schema. The sum is scaled by 1.1 and floored.
	 */
	async countToolTokens(tools: readonly LanguageModelChatTool[]): Promise<number> {
		const baseToolTokens = 16;
		const baseTokensPerTool = 8;

		let estimate = tools.length ? baseToolTokens : 0;
		for (const { name, description, inputSchema } of tools) {
			estimate += baseTokensPerTool;
			estimate += await this._countObjectTokens({ name, description, parameters: inputSchema });
		}

		// This is an estimate, so give a little safety margin
		return Math.floor(estimate * 1.1);
	}

	/**
	 * Recursively sums the token lengths of an object's keys and string values.
	 *
	 * Entries with a falsy value are skipped entirely (key included). For truthy
	 * values the key is always counted; strings add their own length, objects are
	 * descended into, and any other value type contributes only its key.
	 */
	private async _countObjectTokens(obj: Record<string, unknown>): Promise<number> {
		let total = 0;
		for (const [key, value] of Object.entries(obj)) {
			if (value) {
				total += await this._textTokenLength(key);
				if (typeof value === 'string') {
					total += await this._textTokenLength(value);
				} else if (typeof value === 'object') {
					total += await this._countObjectTokens(value as Record<string, unknown>);
				}
			}
		}
		return total;
	}
}
122