CoCalc -- geminiMessageConverter.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/extension/byok/common/geminiMessageConverter.ts
13399 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5
import type { Content, FunctionCall, FunctionResponse, Part } from '@google/genai';
6
import { Raw } from '@vscode/prompt-tsx';
7
import type { LanguageModelChatMessage } from 'vscode';
8
import { CustomDataPartMimeTypes } from '../../../platform/endpoint/common/endpointTypes';
9
import { LanguageModelChatMessageRole, LanguageModelDataPart, LanguageModelTextPart, LanguageModelThinkingPart, LanguageModelToolCallPart, LanguageModelToolResultPart, LanguageModelToolResultPart2 } from '../../../vscodeTypes';
10

11
/**
 * Derives the Gemini function name from a tool-call id.
 *
 * Call ids are expected to look like `<functionName>_<timestamp>`. Only the
 * trailing `_<suffix>` segment is stripped, so function names that themselves
 * contain underscores (e.g. `read_file`) are preserved instead of being cut at
 * their first underscore.
 *
 * @param callId Tool-call id taken from a tool result part.
 * @returns The function name, or `'unknown_function'` when none can be derived.
 */
function geminiFunctionNameFromCallId(callId: string | undefined): string {
	if (!callId) {
		return 'unknown_function';
	}
	const suffixStart = callId.lastIndexOf('_');
	const name = suffixStart === -1 ? callId : callId.slice(0, suffixStart);
	return name || 'unknown_function';
}

/**
 * Converts VS Code language-model message parts into Gemini `Part`s.
 *
 * - Thinking parts emit nothing; a string `metadata.signature` is remembered and
 *   attached as `thoughtSignature` to the next tool call.
 * - Tool calls become `functionCall` parts.
 * - Data parts become base64 `inlineData`, except stateful-marker and
 *   cache-control parts, which are dropped.
 * - Tool results become `functionResponse` parts whose payload is the parsed JSON
 *   text when it is a plain object, `{ result: ... }` otherwise, plus an `images`
 *   array describing any image data parts.
 * - Text parts are kept unless they are exactly the empty string.
 *
 * @param content Parts of a single chat message, in order.
 * @returns The Gemini parts, in the same relative order.
 */
function apiContentToGeminiContent(content: (LanguageModelTextPart | LanguageModelToolResultPart | LanguageModelToolCallPart | LanguageModelDataPart | LanguageModelThinkingPart)[]): Part[] {
	const convertedContent: Part[] = [];
	let pendingSignature: string | undefined;

	for (const part of content) {
		if (part instanceof LanguageModelThinkingPart) {
			// Thinking content itself is not sent to Gemini; only the thought signature is
			// carried forward so it can be attached to the next function call.
			if (part.metadata && typeof part.metadata === 'object' && 'signature' in part.metadata) {
				const metadataObj = part.metadata as Record<string, unknown>;
				if (typeof metadataObj.signature === 'string') {
					pendingSignature = metadataObj.signature;
				}
			}
		} else if (part instanceof LanguageModelToolCallPart) {
			convertedContent.push({
				functionCall: {
					name: part.name,
					args: part.input as Record<string, unknown> || {}
				},
				// Attach pending thought signature if available (required by Gemini 3 for function calling)
				...(pendingSignature ? { thoughtSignature: pendingSignature } : {})
			});
			// A signature is consumed by the first tool call that follows it.
			pendingSignature = undefined;
		} else if (part instanceof LanguageModelDataPart) {
			if (part.mimeType !== CustomDataPartMimeTypes.StatefulMarker && part.mimeType !== CustomDataPartMimeTypes.CacheControl) {
				convertedContent.push({
					inlineData: {
						data: Buffer.from(part.data).toString('base64'),
						mimeType: part.mimeType
					}
				});
			}
		} else if (part instanceof LanguageModelToolResultPart || part instanceof LanguageModelToolResultPart2) {
			// Tool results may mix text and image parts.
			const textContent = part.content
				.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
				.map(p => p.value)
				.join('');

			const imageParts = part.content.filter((p): p is LanguageModelDataPart =>
				p instanceof LanguageModelDataPart &&
				p.mimeType !== CustomDataPartMimeTypes.StatefulMarker &&
				p.mimeType !== CustomDataPartMimeTypes.CacheControl
			);

			// Preserve structured JSON if possible; Gemini function responses expect an object.
			let responsePayload: Record<string, unknown> = {};
			if (textContent) {
				try {
					const parsed: unknown = JSON.parse(textContent);
					responsePayload = typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)
						? parsed as Record<string, unknown>
						: { result: parsed };
				} catch {
					// Not JSON: pass the text through and mention any images in the text itself.
					const imageDescription = imageParts.length > 0
						? `\n[Contains ${imageParts.length} image(s) with types: ${imageParts.map(p => p.mimeType).join(', ')}]`
						: '';
					responsePayload = { result: textContent + imageDescription };
				}
			}

			// Images are embedded in the structured response, with or without text.
			if (imageParts.length > 0) {
				responsePayload.images = imageParts.map(p => ({
					mimeType: p.mimeType,
					size: p.data.length,
					data: Buffer.from(p.data).toString('base64')
				}));
			}

			const functionResponse: FunctionResponse = {
				name: geminiFunctionNameFromCallId(part.callId),
				response: responsePayload
			};

			convertedContent.push({ functionResponse });
		} else if (part instanceof LanguageModelTextPart) {
			// Only filter completely empty strings; whitespace-only text is kept.
			if (part.value !== '') {
				convertedContent.push({
					text: part.value
				});
			}
		}
	}
	return convertedContent;
}
122

123
/**
 * Converts VS Code chat messages into the Gemini request shape.
 *
 * - System messages are not placed in `contents`; their text parts are joined into
 *   `systemInstruction`. A later non-blank system message replaces an earlier one.
 * - Assistant messages become `model` contents and user messages become `user` contents.
 * - `functionResponse` parts found inside a `model` content are moved into a `user`
 *   content inserted directly after it.
 * - Contents that end up with no parts are dropped, whatever their role.
 *
 * @param messages Chat messages in conversation order.
 * @returns The Gemini `contents` plus the optional `systemInstruction`.
 */
export function apiMessageToGeminiMessage(messages: LanguageModelChatMessage[]): { contents: Content[]; systemInstruction?: Content } {
	const converted: Content[] = [];
	let systemInstruction: Content | undefined;

	// NOTE(review): this map is only ever written to within this function and never read.
	// It is kept so the surrounding file (including its `FunctionCall` import) is unaffected;
	// it is a candidate for removal together with that import.
	const pendingToolCalls = new Map<string, FunctionCall>();

	for (const message of messages) {
		if (message.role === LanguageModelChatMessageRole.System) {
			// Gemini uses system instruction separately
			const systemText = message.content
				.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
				.map(p => p.value)
				.join('');

			if (systemText.trim()) {
				systemInstruction = {
					role: 'user',
					parts: [{ text: systemText }]
				};
			}
		} else if (message.role === LanguageModelChatMessageRole.Assistant) {
			const parts = apiContentToGeminiContent(message.content);

			for (const part of parts) {
				if (part.functionCall && part.functionCall.name) {
					pendingToolCalls.set(part.functionCall.name, part.functionCall);
				}
			}

			converted.push({ role: 'model', parts });
		} else if (message.role === LanguageModelChatMessageRole.User) {
			converted.push({ role: 'user', parts: apiContentToGeminiContent(message.content) });
		}
	}

	// Normalize in a single pass:
	//  1. Gemini expects tool responses to be supplied by the *user*/caller after the model issues a
	//     functionCall, so functionResponse parts embedded in a 'model' content are split out into a
	//     'user' content placed immediately after it.
	//  2. Contents without any parts are omitted for every role. Parts can all be filtered out during
	//     conversion (empty text, cache-control / stateful-marker data, thinking-only messages).
	//     NOTE(review): the Gemini API is believed to reject contents with empty `parts` — confirm.
	const contents: Content[] = [];
	for (const entry of converted) {
		const parts = entry.parts ?? [];
		if (entry.role === 'model' && parts.some(p => 'functionResponse' in p)) {
			const modelParts = parts.filter(p => !('functionResponse' in p));
			const toolResultParts = parts.filter(p => 'functionResponse' in p);
			if (modelParts.length > 0) {
				contents.push({ role: 'model', parts: modelParts });
			}
			contents.push({ role: 'user', parts: toolResultParts });
		} else if (parts.length > 0) {
			contents.push(entry);
		}
	}

	return { contents, systemInstruction };
}
202

203
/**
 * Builds a log-friendly view of a Gemini request as `Raw.ChatMessage`s.
 *
 * The messages come from `geminiMessagesToRawMessages`; bulky payloads are then
 * swapped for placeholders: every image part gets the URL `'(image)'` and the whole
 * content of a tool message is replaced by a single `'(tool result)'` text part.
 *
 * @param contents Gemini contents to convert.
 * @param systemInstruction Optional Gemini system instruction.
 * @returns New message objects with placeholders applied.
 */
export function geminiMessagesToRawMessagesForLogging(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
	const loggable: Raw.ChatMessage[] = [];

	for (const rawMessage of geminiMessagesToRawMessages(contents, systemInstruction)) {
		// Image payloads are replaced part by part; all other parts are passed through untouched.
		const redacted = rawMessage.content.map(piece =>
			piece.type === Raw.ChatCompletionContentPartKind.Image
				? { ...piece, imageUrl: { url: '(image)' } }
				: piece
		);

		// Tool messages lose their entire content, not just the images.
		loggable.push(
			rawMessage.role === Raw.ChatRole.Tool
				? { ...rawMessage, content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: '(tool result)' }] }
				: { ...rawMessage, content: redacted }
		);
	}

	return loggable;
}
231

232
/**
 * Returns true for entries of a function response's `images` array that look like
 * inline images, i.e. non-null objects carrying both `data` and `mimeType` keys.
 */
function isInlineImageEntry(value: unknown): value is { mimeType: unknown; data: unknown; size?: unknown } {
	return !!value && typeof value === 'object' && 'data' in value && 'mimeType' in value;
}

/**
 * Converts Gemini `contents` (and an optional system instruction) into `Raw.ChatMessage`s.
 *
 * - Text parts of the system instruction become one system message.
 * - `text` and `inlineData` parts become text / image (data URL) content parts.
 * - `functionCall` parts become tool calls on assistant (`model`) messages; the
 *   function name doubles as the call id because Gemini has no call ids.
 * - Each `functionResponse` part becomes its own tool message, emitted before the
 *   main message of the content it belongs to. Inline images found in
 *   `response.images` become image parts, and the JSON text keeps only their
 *   `mimeType` and `size`.
 *
 * @param contents Gemini contents in conversation order.
 * @param systemInstruction Optional Gemini system instruction.
 * @returns The equivalent raw chat messages.
 */
export function geminiMessagesToRawMessages(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
	const rawMessages: Raw.ChatMessage[] = [];

	// Add system instruction if present
	if (systemInstruction?.parts) {
		const systemContent: Raw.ChatCompletionContentPart[] = [];
		for (const part of systemInstruction.parts) {
			if (part.text) {
				systemContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
			}
		}
		if (systemContent.length) {
			rawMessages.push({ role: Raw.ChatRole.System, content: systemContent });
		}
	}

	// Convert Gemini contents to raw messages
	for (const content of contents) {
		const messageParts: Raw.ChatCompletionContentPart[] = [];
		let toolCalls: Raw.ChatMessageToolCall[] | undefined;

		for (const part of content.parts ?? []) {
			if (part.text) {
				messageParts.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
			} else if (part.inlineData) {
				messageParts.push({
					type: Raw.ChatCompletionContentPartKind.Image,
					imageUrl: { url: `data:${part.inlineData.mimeType};base64,${part.inlineData.data}` }
				});
			} else if (part.functionCall && part.functionCall.name) {
				toolCalls ??= [];
				toolCalls.push({
					id: part.functionCall.name, // Gemini doesn't have call IDs, use name
					type: 'function',
					function: {
						name: part.functionCall.name,
						arguments: JSON.stringify(part.functionCall.args ?? {})
					}
				});
			} else if (part.functionResponse && part.functionResponse.name) {
				// Function responses are emitted as tool messages
				const toolContent: Raw.ChatCompletionContentPart[] = [];

				const response = part.functionResponse.response;
				if (response && typeof response === 'object' && 'images' in response && Array.isArray(response.images)) {
					const entries: unknown[] = response.images;

					// Extract inline images from the structured response and convert to Raw format
					for (const img of entries) {
						if (isInlineImageEntry(img)) {
							toolContent.push({
								type: Raw.ChatCompletionContentPartKind.Image,
								imageUrl: { url: `data:${img.mimeType};base64,${img.data}` }
							});
						}
					}

					// Text view of the response without the raw image data. Only entries that were
					// recognised as inline images are summarised; anything else (null, strings,
					// objects of another shape) is passed through unchanged, since the response
					// may be arbitrary JSON that merely happens to contain an `images` array.
					const cleanResponse = {
						...response,
						images: entries.map(img => isInlineImageEntry(img)
							? {
								mimeType: img.mimeType,
								size: img.size || (img.data ? (img.data as { length?: number }).length : 0)
							}
							: img)
					};
					toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(cleanResponse) });
				} else {
					// Standard text-only response
					toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(response) });
				}

				rawMessages.push({
					role: Raw.ChatRole.Tool,
					content: toolContent,
					toolCallId: part.functionResponse.name
				});
			}
		}

		// Add the main message if it has content
		if (messageParts.length > 0 || toolCalls) {
			const role = content.role === 'model' ? Raw.ChatRole.Assistant : Raw.ChatRole.User;
			const msg: Raw.ChatMessage = { role, content: messageParts };

			// Tool calls are only meaningful on assistant messages.
			if (toolCalls && content.role === 'model') {
				(msg as Raw.AssistantChatMessage).toolCalls = toolCalls;
			}

			rawMessages.push(msg);
		}
	}

	return rawMessages;
}
327
Product

Resources

Company