Path: blob/main/extensions/copilot/src/platform/endpoint/vscode-node/test/extChatTokenizer.spec.ts
13405 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { Raw } from '@vscode/prompt-tsx';
import { beforeEach, describe, expect, it, vi } from 'vitest';
import type { LanguageModelChat, LanguageModelChatMessage, LanguageModelChatMessage2 } from 'vscode';
import { ExtensionContributedChatTokenizer } from '../extChatTokenizer';

/** Any input shape the mock's `countTokens` accepts. */
type TokenCountInput = string | LanguageModelChatMessage | LanguageModelChatMessage2;

/** Strategy used by the mock to turn an input into a token count. */
type TokenCountFn = (input: TokenCountInput) => number;

/**
 * Counts whitespace-separated words in `text`.
 * Empty and whitespace-only strings yield 0 because empty fragments are filtered out.
 */
function countWords(text: string): number {
	return text.split(/\s+/).filter(Boolean).length;
}

/**
 * Default counting strategy: approximate tokens as words (split by whitespace).
 * - For a string, returns its word count.
 * - For a message, sums the word counts of every content part that exposes a
 *   string `value`; parts without a string `value` contribute nothing.
 */
const defaultTokenCount: TokenCountFn = input => {
	if (typeof input === 'string') {
		return countWords(input);
	}
	// For messages, count tokens in all text content parts
	let total = 0;
	for (const part of input.content) {
		if ('value' in part && typeof part.value === 'string') {
			total += countWords(part.value);
		}
	}
	return total;
};

/**
 * Mock implementation of LanguageModelChat for testing purposes.
 * Simulates token counting with a configurable strategy.
 *
 * Only `countTokens` is implemented, which is why the tests cast instances
 * through `unknown` before handing them to the tokenizer.
 */
class MockLanguageModelChat implements Partial<LanguageModelChat> {
	private readonly _tokenCountFn: TokenCountFn;

	/**
	 * @param tokenCountFn Optional counting strategy. When omitted, tokens are
	 * approximated as whitespace-separated words.
	 */
	constructor(tokenCountFn?: TokenCountFn) {
		this._tokenCountFn = tokenCountFn ?? defaultTokenCount;
	}

	/**
	 * Applies the configured strategy to `input`.
	 * @returns A thenable that resolves with the computed count.
	 */
	countTokens(input: TokenCountInput): Thenable<number> {
		return Promise.resolve(this._tokenCountFn(input));
	}
}

describe('ExtensionContributedChatTokenizer', () => {
	let tokenizer: ExtensionContributedChatTokenizer;
	let mockLanguageModel: MockLanguageModelChat;

	// Every test starts from a fresh tokenizer backed by the word-counting mock.
	beforeEach(() => {
		mockLanguageModel = new MockLanguageModelChat();
		tokenizer = new ExtensionContributedChatTokenizer(mockLanguageModel as unknown as LanguageModelChat);
	});

	describe('tokenLength', () => {
		it('should count tokens for a simple string', async () => {
			const result = await tokenizer.tokenLength('Hello world');
			expect(result).toBe(2); // "Hello" and "world"
		});

		it('should return 0 for an empty string', async () => {
			const result = await tokenizer.tokenLength('');
			expect(result).toBe(0);
		});

		it('should count tokens for a text content part', async () => {
			const textPart: Raw.ChatCompletionContentPart = {
				type: Raw.ChatCompletionContentPartKind.Text,
				text: 'This is a test message'
			};
			const result = await tokenizer.tokenLength(textPart);
			expect(result).toBe(5); // 5 words
		});

		it('should return tokenUsage for opaque content parts', async () => {
			const opaquePart: Raw.ChatCompletionContentPart = {
				type: Raw.ChatCompletionContentPartKind.Opaque,
				value: { some: 'data' },
				tokenUsage: 42
			};
			const result = await tokenizer.tokenLength(opaquePart);
			expect(result).toBe(42);
		});

		it('should return 0 for opaque content parts without tokenUsage', async () => {
			const opaquePart: Raw.ChatCompletionContentPart = {
				type: Raw.ChatCompletionContentPartKind.Opaque,
				value: { some: 'data' }
			};
			const result = await tokenizer.tokenLength(opaquePart);
			expect(result).toBe(0);
		});

		it('should return 0 for cache breakpoint content parts', async () => {
			const cacheBreakpoint: Raw.ChatCompletionContentPart = {
				type: Raw.ChatCompletionContentPartKind.CacheBreakpoint
			};
			const result = await tokenizer.tokenLength(cacheBreakpoint);
			expect(result).toBe(0);
		});

		it('should count tokens for document content parts', async () => {
			const documentPart: Raw.ChatCompletionContentPart = {
				type: Raw.ChatCompletionContentPartKind.Document,
				documentData: { data: 'JVBERi0xLjQK base64 encoded pdf data', mediaType: 'application/pdf' },
			};
			const result = await tokenizer.tokenLength(documentPart);
			// Token length for documents is estimated from document size; it should be positive.
			expect(result).toBeGreaterThan(0);
		});
	});

	describe('countMessageTokens', () => {
		it('should count tokens for a user message', async () => {
			const message: Raw.ChatMessage = {
				role: Raw.ChatRole.User,
				content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello there' }]
			};
			const result = await tokenizer.countMessageTokens(message);
			// BaseTokensPerMessage (3) + message content tokens
			expect(result).toBeGreaterThanOrEqual(3);
		});

		it('should count tokens for an assistant message', async () => {
			const message: Raw.ChatMessage = {
				role: Raw.ChatRole.Assistant,
				content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'I can help with that' }]
			};
			const result = await tokenizer.countMessageTokens(message);
			expect(result).toBeGreaterThanOrEqual(3);
		});

		it('should count tokens for a system message', async () => {
			const message: Raw.ChatMessage = {
				role: Raw.ChatRole.System,
				content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'You are a helpful assistant' }]
			};
			const result = await tokenizer.countMessageTokens(message);
			expect(result).toBeGreaterThanOrEqual(3);
		});
	});

	describe('countMessagesTokens', () => {
		it('should count tokens for multiple messages', async () => {
			const messages: Raw.ChatMessage[] = [
				{
					role: Raw.ChatRole.System,
					content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'You are helpful' }]
				},
				{
					role: Raw.ChatRole.User,
					content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hi' }]
				},
				{
					role: Raw.ChatRole.Assistant,
					content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello' }]
				}
			];
			const result = await tokenizer.countMessagesTokens(messages);
			// BaseTokensPerCompletion (3) + 3 messages * BaseTokensPerMessage (3) + content tokens
			expect(result).toBeGreaterThanOrEqual(12);
		});

		it('should return base tokens for empty messages array', async () => {
			const result = await tokenizer.countMessagesTokens([]);
			expect(result).toBe(3); // BaseTokensPerCompletion
		});
	});

	describe('countToolTokens', () => {
		it('should count tokens for a single tool', async () => {
			const tools = [{
				name: 'get_weather',
				description: 'Get the current weather',
				inputSchema: {
					type: 'object',
					properties: {
						location: { type: 'string' }
					}
				}
			}];
			const result = await tokenizer.countToolTokens(tools);
			// baseToolTokens (16) + baseTokensPerTool (8) + object tokens * 1.1
			expect(result).toBeGreaterThan(24);
		});

		it('should count tokens for multiple tools', async () => {
			const tools = [
				{
					name: 'get_weather',
					description: 'Get weather info',
					inputSchema: { type: 'object' }
				},
				{
					name: 'search',
					description: 'Search the web',
					inputSchema: { type: 'object' }
				}
			];
			const result = await tokenizer.countToolTokens(tools);
			// baseToolTokens (16) + 2 * baseTokensPerTool (8) + object tokens
			expect(result).toBeGreaterThan(32);
		});

		it('should return 0 for empty tools array', async () => {
			const result = await tokenizer.countToolTokens([]);
			expect(result).toBe(0);
		});
	});

	// These tests swap the word-counting mock for a spy that resolves to a fixed
	// value, so the expected totals do not depend on the text being counted.
	describe('with custom token counting', () => {
		it('should use the language model countTokens method', async () => {
			const countTokensSpy = vi.fn().mockResolvedValue(10);
			const customMock = {
				countTokens: countTokensSpy
			} as unknown as LanguageModelChat;

			const customTokenizer = new ExtensionContributedChatTokenizer(customMock);
			const result = await customTokenizer.tokenLength('test string');

			expect(countTokensSpy).toHaveBeenCalledWith('test string');
			expect(result).toBe(10);
		});

		it('should delegate message token counting to language model', async () => {
			const countTokensSpy = vi.fn().mockResolvedValue(15);
			const customMock = {
				countTokens: countTokensSpy
			} as unknown as LanguageModelChat;

			const customTokenizer = new ExtensionContributedChatTokenizer(customMock);
			const message: Raw.ChatMessage = {
				role: Raw.ChatRole.User,
				content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: 'Hello' }]
			};

			const result = await customTokenizer.countMessageTokens(message);
			// BaseTokensPerMessage (3) + 15 from language model
			expect(result).toBe(18);
			expect(countTokensSpy).toHaveBeenCalled();
		});
	});
});