Path: blob/main/extensions/copilot/src/extension/byok/common/geminiMessageConverter.ts
13399 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import type { Content, FunctionCall, FunctionResponse, Part } from '@google/genai';
import { Raw } from '@vscode/prompt-tsx';
import type { LanguageModelChatMessage } from 'vscode';
import { CustomDataPartMimeTypes } from '../../../platform/endpoint/common/endpointTypes';
import { LanguageModelChatMessageRole, LanguageModelDataPart, LanguageModelTextPart, LanguageModelThinkingPart, LanguageModelToolCallPart, LanguageModelToolResultPart, LanguageModelToolResultPart2 } from '../../../vscodeTypes';

/** Fallback `functionResponse.name` used when no function name can be recovered for a tool result. */
const UNKNOWN_FUNCTION_NAME = 'unknown_function';

/**
 * Shape used to embed an image inside a `functionResponse.response` payload.
 * Written by {@link serializeImageParts} and read back by {@link functionResponseToToolContent}.
 */
interface SerializedImage {
	mimeType: string;
	size?: unknown;
	data: string;
}

/** Returns true when `value` is a non-null, non-array object. */
function isPlainRecord(value: unknown): value is Record<string, unknown> {
	return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Returns true when `value` is an object carrying string `data` and `mimeType` fields. */
function isSerializedImage(value: unknown): value is SerializedImage {
	if (typeof value !== 'object' || value === null) {
		return false;
	}
	const candidate = value as Record<string, unknown>;
	return typeof candidate.data === 'string' && typeof candidate.mimeType === 'string';
}

/** Returns true for data parts that are bookkeeping markers (stateful marker / cache control) rather than payload. */
function isInternalDataPart(part: LanguageModelDataPart): boolean {
	return part.mimeType === CustomDataPartMimeTypes.StatefulMarker || part.mimeType === CustomDataPartMimeTypes.CacheControl;
}

/** Converts image data parts into the base64 structure stored under `response.images`. */
function serializeImageParts(imageParts: readonly LanguageModelDataPart[]): SerializedImage[] {
	return imageParts.map(p => ({
		mimeType: p.mimeType,
		size: p.data.length,
		data: Buffer.from(p.data).toString('base64')
	}));
}

/**
 * Recovers a function name from a call id of the form `functionName_timestamp`.
 *
 * The split happens on the LAST underscore so that names which themselves contain
 * underscores are preserved (`read_file_1700000000000` -> `read_file`). A call id
 * without any underscore is returned unchanged.
 *
 * @param callId The tool call id, possibly missing.
 * @returns The recovered name, or {@link UNKNOWN_FUNCTION_NAME} when nothing usable remains.
 */
function functionNameFromCallId(callId: string | undefined): string {
	if (!callId) {
		return UNKNOWN_FUNCTION_NAME;
	}
	const separatorIndex = callId.lastIndexOf('_');
	const name = separatorIndex === -1 ? callId : callId.slice(0, separatorIndex);
	return name || UNKNOWN_FUNCTION_NAME;
}

/**
 * Converts the parts of a single chat message into Gemini `Part`s.
 *
 * - Thinking parts are not emitted; a string `metadata.signature` is remembered and attached as
 *   `thoughtSignature` to the next tool call part.
 * - Tool call parts become `functionCall` parts.
 * - Data parts become `inlineData` (base64), except internal marker parts which are dropped.
 * - Tool result parts become `functionResponse` parts. Text is parsed as JSON when possible
 *   (non-object JSON and non-JSON text are wrapped as `{ result }`), and images are attached under `images`.
 * - Text parts are emitted unless they are the empty string (whitespace-only text is kept).
 *
 * @param content The message parts to convert.
 * @param toolNamesByCallId Optional shared map of call id -> function name. Tool calls seen here are
 *   recorded in it, and tool results consult it first so that `functionResponse.name` matches the
 *   originating `functionCall.name` even when the call id does not encode the name.
 * @returns The converted Gemini parts, in input order.
 */
function apiContentToGeminiContent(content: (LanguageModelTextPart | LanguageModelToolResultPart | LanguageModelToolCallPart | LanguageModelDataPart | LanguageModelThinkingPart)[], toolNamesByCallId?: Map<string, string>): Part[] {
	const convertedContent: Part[] = [];
	let pendingSignature: string | undefined;

	for (const part of content) {
		if (part instanceof LanguageModelThinkingPart) {
			// Extract thought signature from thinking part metadata.
			// Note: the thinking content itself is not emitted to Gemini; only the signature is carried
			// forward and attached to the next function call.
			if (part.metadata && typeof part.metadata === 'object' && 'signature' in part.metadata) {
				const { signature } = part.metadata as Record<string, unknown>;
				if (typeof signature === 'string') {
					pendingSignature = signature;
				}
			}
		} else if (part instanceof LanguageModelToolCallPart) {
			const functionCall: FunctionCall = {
				name: part.name,
				args: (part.input as Record<string, unknown>) || {}
			};

			// Remember which function this call id belongs to so the matching result can be named correctly.
			if (part.callId) {
				toolNamesByCallId?.set(part.callId, part.name);
			}

			// Attach pending thought signature if available (required by Gemini 3 for function calling)
			convertedContent.push(pendingSignature ? { functionCall, thoughtSignature: pendingSignature } : { functionCall });
			pendingSignature = undefined; // A signature is consumed by at most one function call
		} else if (part instanceof LanguageModelDataPart) {
			if (!isInternalDataPart(part)) {
				convertedContent.push({
					inlineData: {
						data: Buffer.from(part.data).toString('base64'),
						mimeType: part.mimeType
					}
				});
			}
		} else if (part instanceof LanguageModelToolResultPart || part instanceof LanguageModelToolResultPart2) {
			// Convert tool result content - handle both text and image parts
			const textContent = part.content
				.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
				.map(p => p.value)
				.join('');

			const imageParts = part.content.filter((p): p is LanguageModelDataPart =>
				p instanceof LanguageModelDataPart && !isInternalDataPart(p)
			);

			// Preserve structured JSON if possible
			let responsePayload: Record<string, unknown> = {};
			if (textContent) {
				try {
					const parsed: unknown = JSON.parse(textContent);
					// Gemini function responses expect an object; wrap primitives, null and arrays.
					responsePayload = isPlainRecord(parsed) ? parsed : { result: parsed };
				} catch {
					// Not JSON: send the raw text, with a short note describing any attached images.
					const imageDescription = imageParts.length > 0
						? `\n[Contains ${imageParts.length} image(s) with types: ${imageParts.map(p => p.mimeType).join(', ')}]`
						: '';
					responsePayload = { result: textContent + imageDescription };
				}
			}
			if (imageParts.length > 0) {
				responsePayload.images = serializeImageParts(imageParts);
			}

			const functionResponse: FunctionResponse = {
				// Prefer the name of the call that produced this result; otherwise fall back to parsing
				// the `functionName_timestamp` call id. `||` is deliberate: an empty name is unusable.
				name: toolNamesByCallId?.get(part.callId) || functionNameFromCallId(part.callId),
				response: responsePayload
			};

			convertedContent.push({ functionResponse });
		} else if (part instanceof LanguageModelTextPart) {
			// Text content - only filter completely empty strings, keep whitespace
			if (part.value !== '') {
				convertedContent.push({
					text: part.value
				});
			}
		}
	}
	return convertedContent;
}

/**
 * Ensures `functionResponse` parts never live inside a `model` turn.
 *
 * Gemini expects tool responses to be supplied by the user/caller after the model issues a
 * `functionCall`. Any `functionResponse` parts found in a `model` content are moved into a new
 * `user` content inserted immediately after it, and `model` contents left without parts are dropped.
 *
 * @param contents The converted conversation.
 * @returns A new array; the input array is not modified.
 */
function moveToolResultsOutOfModelTurns(contents: readonly Content[]): Content[] {
	const normalized: Content[] = [];
	for (const content of contents) {
		if (content.role !== 'model') {
			normalized.push(content);
			continue;
		}

		const parts = content.parts ?? [];
		const toolResultParts = parts.filter(p => 'functionResponse' in p);
		if (toolResultParts.length === 0) {
			// Nothing to move; keep the turn unless it has no parts at all.
			if (parts.length > 0) {
				normalized.push(content);
			}
			continue;
		}

		const modelParts = parts.filter(p => !('functionResponse' in p));
		if (modelParts.length > 0) {
			normalized.push({ ...content, parts: modelParts });
		}
		normalized.push({ role: 'user', parts: toolResultParts });
	}
	return normalized;
}

/**
 * Converts VS Code chat messages into the Gemini request shape.
 *
 * - System messages are not added to `contents`; their text parts are joined into `systemInstruction`.
 *   If several system messages carry non-blank text, the last one wins.
 * - Assistant messages become `model` contents and user messages become `user` contents.
 * - Tool results that ended up inside a `model` content are moved to a following `user` content,
 *   and `model` contents without parts are removed.
 *
 * @param messages The chat messages, in conversation order.
 * @returns The Gemini `contents` plus an optional `systemInstruction`.
 */
export function apiMessageToGeminiMessage(messages: LanguageModelChatMessage[]): { contents: Content[]; systemInstruction?: Content } {
	const contents: Content[] = [];
	let systemInstruction: Content | undefined;

	// Track tool calls (call id -> function name) to match with their responses
	const toolNamesByCallId = new Map<string, string>();

	for (const message of messages) {
		if (message.role === LanguageModelChatMessageRole.System) {
			// Gemini uses system instruction separately
			const systemText = message.content
				.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
				.map(p => p.value)
				.join('');

			if (systemText.trim()) {
				systemInstruction = {
					role: 'user',
					parts: [{ text: systemText }]
				};
			}
		} else if (message.role === LanguageModelChatMessageRole.Assistant) {
			contents.push({
				role: 'model',
				parts: apiContentToGeminiContent(message.content, toolNamesByCallId)
			});
		} else if (message.role === LanguageModelChatMessageRole.User) {
			contents.push({
				role: 'user',
				parts: apiContentToGeminiContent(message.content, toolNamesByCallId)
			});
		}
	}

	return { contents: moveToolResultsOutOfModelTurns(contents), systemInstruction };
}

/**
 * Same conversion as {@link geminiMessagesToRawMessages}, with bulky content replaced by
 * placeholders for logging: image URLs become `(image)` and the whole content of tool messages
 * becomes a single `(tool result)` text part.
 *
 * @param contents The Gemini conversation contents.
 * @param systemInstruction Optional Gemini system instruction.
 * @returns Raw chat messages that are safe to log.
 */
export function geminiMessagesToRawMessagesForLogging(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
	const fullMessages = geminiMessagesToRawMessages(contents, systemInstruction);

	return fullMessages.map(message => {
		// Tool messages are replaced wholesale, so there is no need to rewrite their parts first.
		if (message.role === Raw.ChatRole.Tool) {
			return {
				...message,
				content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: '(tool result)' }]
			};
		}

		const content = message.content.map(part => {
			if (part.type === Raw.ChatCompletionContentPartKind.Image) {
				return {
					...part,
					imageUrl: { url: '(image)' }
				};
			}
			return part;
		});

		return {
			...message,
			content
		};
	});
}

/**
 * Converts a `functionResponse.response` payload into raw tool-message content parts.
 *
 * Entries of `response.images` that look like serialized images (string `data` and `mimeType`)
 * are emitted as image parts and replaced in the JSON text by a `{ mimeType, size }` summary.
 * Any other entry is left untouched in the JSON text, so a tool whose own output happens to
 * contain an `images` array is neither corrupted nor able to make the conversion throw.
 *
 * @param response The structured function response, possibly undefined.
 * @returns Image parts (if any) followed by one text part holding the JSON-encoded response.
 */
function functionResponseToToolContent(response: Record<string, unknown> | undefined): Raw.ChatCompletionContentPart[] {
	const toolContent: Raw.ChatCompletionContentPart[] = [];

	const images: unknown = response?.images;
	if (response && Array.isArray(images)) {
		const imageEntries: unknown[] = images;

		// Extract images from structured response and convert to Raw format
		for (const image of imageEntries) {
			if (isSerializedImage(image)) {
				toolContent.push({
					type: Raw.ChatCompletionContentPartKind.Image,
					imageUrl: { url: `data:${image.mimeType};base64,${image.data}` }
				});
			}
		}

		// Create a clean response object without the raw image data for text content
		const cleanResponse = {
			...response,
			images: imageEntries.map(image => isSerializedImage(image)
				? { mimeType: image.mimeType, size: image.size || image.data.length }
				: image)
		};
		toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(cleanResponse) });
	} else {
		// Standard text-only response
		toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(response) });
	}

	return toolContent;
}

/**
 * Converts Gemini contents (and an optional system instruction) into raw chat messages.
 *
 * - Text parts of the system instruction become one system message.
 * - `text` and `inlineData` parts become text / image content parts of the message.
 * - `functionCall` parts become tool calls; they are attached only to `model` contents.
 *   Gemini has no call ids, so the function name is used as the tool call id.
 * - Each `functionResponse` part becomes its own tool message (keyed by function name), emitted
 *   before the enclosing content's own message.
 *
 * @param contents The Gemini conversation contents.
 * @param systemInstruction Optional Gemini system instruction.
 * @returns The equivalent raw chat messages.
 */
export function geminiMessagesToRawMessages(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
	const rawMessages: Raw.ChatMessage[] = [];

	// Add system instruction if present
	if (systemInstruction && systemInstruction.parts) {
		const systemContent: Raw.ChatCompletionContentPart[] = [];
		for (const part of systemInstruction.parts) {
			if (part.text) {
				systemContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
			}
		}
		if (systemContent.length) {
			rawMessages.push({ role: Raw.ChatRole.System, content: systemContent });
		}
	}

	// Convert Gemini contents to raw messages
	for (const content of contents) {
		const messageParts: Raw.ChatCompletionContentPart[] = [];
		let toolCalls: Raw.ChatMessageToolCall[] | undefined;

		for (const part of content.parts ?? []) {
			if (part.text) {
				messageParts.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
			} else if (part.inlineData) {
				messageParts.push({
					type: Raw.ChatCompletionContentPartKind.Image,
					imageUrl: { url: `data:${part.inlineData.mimeType};base64,${part.inlineData.data}` }
				});
			} else if (part.functionCall && part.functionCall.name) {
				toolCalls ??= [];
				toolCalls.push({
					id: part.functionCall.name, // Gemini doesn't have call IDs, use name
					type: 'function',
					function: {
						name: part.functionCall.name,
						arguments: JSON.stringify(part.functionCall.args ?? {})
					}
				});
			} else if (part.functionResponse && part.functionResponse.name) {
				// Function responses should be emitted as tool messages
				rawMessages.push({
					role: Raw.ChatRole.Tool,
					content: functionResponseToToolContent(part.functionResponse.response),
					toolCallId: part.functionResponse.name
				});
			}
		}

		// Add the main message if it has content
		if (messageParts.length > 0 || toolCalls) {
			const role = content.role === 'model' ? Raw.ChatRole.Assistant : Raw.ChatRole.User;
			const msg: Raw.ChatMessage = { role, content: messageParts };

			if (toolCalls && content.role === 'model') {
				(msg as Raw.AssistantChatMessage).toolCalls = toolCalls;
			}

			rawMessages.push(msg);
		}
	}

	return rawMessages;
}