// Source: microsoft/vscode — extensions/copilot/src/extension/byok/common/geminiMessageConverter.ts
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
import type { Content, FunctionCall, FunctionResponse, Part } from '@google/genai';
import { Raw } from '@vscode/prompt-tsx';
import type { LanguageModelChatMessage } from 'vscode';
import { CustomDataPartMimeTypes } from '../../../platform/endpoint/common/endpointTypes';
import { LanguageModelChatMessageRole, LanguageModelDataPart, LanguageModelTextPart, LanguageModelThinkingPart, LanguageModelToolCallPart, LanguageModelToolResultPart, LanguageModelToolResultPart2 } from '../../../vscodeTypes';
/**
 * Converts VS Code LanguageModel content parts into Gemini `Part`s.
 *
 * Per part type:
 * - `LanguageModelThinkingPart`: emits nothing; a string `signature` found in
 *   its metadata is remembered and attached to the NEXT function-call part.
 * - `LanguageModelToolCallPart`: becomes a `functionCall` part (with the
 *   pending thought signature, if any).
 * - `LanguageModelDataPart`: becomes base64 `inlineData`, except the internal
 *   stateful-marker / cache-control mime types, which are skipped.
 * - `LanguageModelToolResultPart(2)`: becomes a `functionResponse` part; text
 *   is kept as structured JSON when it parses, images are embedded as base64
 *   entries in the response payload.
 * - `LanguageModelTextPart`: becomes a `text` part unless the string is empty.
 */
function apiContentToGeminiContent(content: (LanguageModelTextPart | LanguageModelToolResultPart | LanguageModelToolCallPart | LanguageModelDataPart | LanguageModelThinkingPart)[]): Part[] {
	const convertedContent: Part[] = [];
	// Signature harvested from the most recent thinking part; consumed by the
	// next function call emitted below.
	let pendingSignature: string | undefined;

	for (const part of content) {
		if (part instanceof LanguageModelThinkingPart) {
			// Extract thought signature from thinking part metadata
			if (part.metadata && typeof part.metadata === 'object' && 'signature' in part.metadata) {
				const metadataObj = part.metadata as Record<string, unknown>;
				if (typeof metadataObj.signature === 'string') {
					pendingSignature = metadataObj.signature;
				}
			}
			// Note: We don't emit thinking content to Gemini as it's already been processed
			// The signature will be attached to the next function call
		} else if (part instanceof LanguageModelToolCallPart) {
			const functionCallPart: Part = {
				functionCall: {
					name: part.name,
					// 'as' binds tighter than '||', so this is (input as ...) || {}:
					// a null/undefined input falls back to an empty args object.
					args: part.input as Record<string, unknown> || {}
				},
				// Attach pending thought signature if available (required by Gemini 3 for function calling)
				...(pendingSignature ? { thoughtSignature: pendingSignature } : {})
			};

			if (pendingSignature) {
				pendingSignature = undefined; // Clear after use
			}

			convertedContent.push(functionCallPart);
		} else if (part instanceof LanguageModelDataPart) {
			// Internal bookkeeping markers are not forwarded to Gemini.
			if (part.mimeType !== CustomDataPartMimeTypes.StatefulMarker && part.mimeType !== CustomDataPartMimeTypes.CacheControl) {
				convertedContent.push({
					inlineData: {
						data: Buffer.from(part.data).toString('base64'),
						mimeType: part.mimeType
					}
				});
			}
		} else if (part instanceof LanguageModelToolResultPart || part instanceof LanguageModelToolResultPart2) {
			// Convert tool result content - handle both text and image parts
			const textContent = part.content
				.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
				.map(p => p.value)
				.join('');

			// Handle image parts in tool results (again excluding internal markers)
			const imageParts = part.content.filter((p): p is LanguageModelDataPart =>
				p instanceof LanguageModelDataPart &&
				p.mimeType !== CustomDataPartMimeTypes.StatefulMarker &&
				p.mimeType !== CustomDataPartMimeTypes.CacheControl
			);

			// If there are images, we need to handle them differently
			// For now, we'll include image info in the text response since Gemini function responses expect structured data
			let imageDescription = '';
			if (imageParts.length > 0) {
				imageDescription = `\n[Contains ${imageParts.length} image(s) with types: ${imageParts.map(p => p.mimeType).join(', ')}]`;
			}

			// extraction: functionName_timestamp => split on first underscore
			// NOTE(review): this keeps only the text before the FIRST underscore,
			// so a function name that itself contains '_' would be truncated —
			// confirm the callId format against whatever produces it.
			const functionName = part.callId?.split('_')[0] || 'unknown_function';

			// Preserve structured JSON if possible
			let responsePayload: any = {};
			if (textContent) {
				// Handle case with text content (may also have images)
				try {
					responsePayload = JSON.parse(textContent);
					// Gemini function responses expect an object; wrap scalars/arrays.
					if (typeof responsePayload !== 'object' || responsePayload === null || Array.isArray(responsePayload)) {
						responsePayload = { result: responsePayload };
					}
				} catch {
					// Non-JSON text: ship it verbatim with the image summary
					// appended (the summary string is only used on this path).
					responsePayload = { result: textContent + imageDescription };
				}
				// Add image info if present
				if (imageParts.length > 0) {
					responsePayload.images = imageParts.map(p => ({
						mimeType: p.mimeType,
						size: p.data.length,
						data: Buffer.from(p.data).toString('base64')
					}));
				}
			} else if (imageParts.length > 0) {
				// Only images, no text content
				responsePayload = {
					images: imageParts.map(p => ({
						mimeType: p.mimeType,
						size: p.data.length,
						data: Buffer.from(p.data).toString('base64')
					}))
				};
			}

			const functionResponse: FunctionResponse = {
				name: functionName,
				response: responsePayload
			};

			convertedContent.push({ functionResponse });
		} else if (part instanceof LanguageModelTextPart) {
			// Text content - only filter completely empty strings, keep whitespace
			if (part.value !== '') {
				convertedContent.push({
					text: part.value
				});
			}
		}
	}
	return convertedContent;
}
export function apiMessageToGeminiMessage(messages: LanguageModelChatMessage[]): { contents: Content[]; systemInstruction?: Content } {
124
const contents: Content[] = [];
125
let systemInstruction: Content | undefined;
126
127
// Track tool calls to match with their responses
128
const pendingToolCalls = new Map<string, FunctionCall>();
129
130
for (const message of messages) {
131
if (message.role === LanguageModelChatMessageRole.System) {
132
// Gemini uses system instruction separately
133
const systemText = message.content
134
.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
135
.map(p => p.value)
136
.join('');
137
138
if (systemText.trim()) {
139
systemInstruction = {
140
role: 'user',
141
parts: [{ text: systemText }]
142
};
143
}
144
} else if (message.role === LanguageModelChatMessageRole.Assistant) {
145
const parts = apiContentToGeminiContent(message.content);
146
147
// Store function calls for later matching with responses
148
parts.forEach(part => {
149
if (part.functionCall && part.functionCall.name) {
150
pendingToolCalls.set(part.functionCall.name, part.functionCall);
151
}
152
});
153
154
contents.push({
155
role: 'model',
156
parts
157
});
158
} else if (message.role === LanguageModelChatMessageRole.User) {
159
const parts = apiContentToGeminiContent(message.content);
160
161
contents.push({
162
role: 'user',
163
parts
164
});
165
}
166
}
167
168
// Post-process: ensure functionResponse parts are not embedded in 'model' role messages.
169
// Gemini expects tool responses to be supplied by the *user*/caller after the model issues a functionCall.
170
// If upstream accidentally placed tool result parts inside an assistant/model role, we split them out here.
171
for (let i = 0; i < contents.length; i++) {
172
const c = contents[i];
173
if (c.role === 'model' && c.parts && c.parts.some(p => 'functionResponse' in p)) {
174
const modelParts: Part[] = [];
175
const toolResultParts: Part[] = [];
176
for (const p of c.parts) {
177
if ('functionResponse' in p) {
178
toolResultParts.push(p);
179
} else {
180
modelParts.push(p);
181
}
182
}
183
// Replace original with model-only parts
184
c.parts = modelParts;
185
// Insert a new user role content immediately after with the function responses
186
if (toolResultParts.length) {
187
contents.splice(i + 1, 0, { role: 'user', parts: toolResultParts });
188
i++; // Skip over inserted element
189
}
190
}
191
}
192
// Cleanup: remove any model messages that became empty after extraction
193
for (let i = contents.length - 1; i >= 0; i--) {
194
const c = contents[i];
195
if (c.role === 'model' && (!c.parts || c.parts.length === 0)) {
196
contents.splice(i, 1);
197
}
198
}
199
200
return { contents, systemInstruction };
201
}
202
203
export function geminiMessagesToRawMessagesForLogging(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
204
const fullMessages = geminiMessagesToRawMessages(contents, systemInstruction);
205
206
// Replace bulky content with placeholders for logging
207
return fullMessages.map(message => {
208
const content = message.content.map(part => {
209
if (part.type === Raw.ChatCompletionContentPartKind.Image) {
210
return {
211
...part,
212
imageUrl: { url: '(image)' }
213
};
214
}
215
return part;
216
});
217
218
if (message.role === Raw.ChatRole.Tool) {
219
return {
220
...message,
221
content: [{ type: Raw.ChatCompletionContentPartKind.Text, text: '(tool result)' }]
222
};
223
}
224
225
return {
226
...message,
227
content
228
};
229
});
230
}
231
232
/**
 * Converts Gemini `Content`s (plus optional system instruction) back into
 * `Raw.ChatMessage`s.
 *
 * Mapping:
 * - `systemInstruction` text parts -> one leading System message.
 * - `text` parts -> Text content parts.
 * - `inlineData` parts -> Image content parts (data: URL).
 * - `functionCall` parts -> `toolCalls` on the containing message (Gemini has
 *   no call IDs, so the function name doubles as the ID).
 * - `functionResponse` parts -> separate Tool messages. Note these Tool
 *   messages are pushed BEFORE the content's own message, because they are
 *   emitted while walking the parts and the main message is pushed last.
 */
export function geminiMessagesToRawMessages(contents: Content[], systemInstruction?: Content): Raw.ChatMessage[] {
	const rawMessages: Raw.ChatMessage[] = [];

	// Add system instruction if present
	if (systemInstruction && systemInstruction.parts) {
		const systemContent: Raw.ChatCompletionContentPart[] = [];
		systemInstruction.parts.forEach((part: Part) => {
			if (part.text) {
				systemContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
			}
		});
		if (systemContent.length) {
			rawMessages.push({ role: Raw.ChatRole.System, content: systemContent });
		}
	}

	// Convert Gemini contents to raw messages
	for (const content of contents) {
		const messageParts: Raw.ChatCompletionContentPart[] = [];
		let toolCalls: Raw.ChatMessageToolCall[] | undefined;

		if (content.parts) {
			content.parts.forEach((part: Part) => {
				// Note: empty-string text parts are skipped by this truthiness check.
				if (part.text) {
					messageParts.push({ type: Raw.ChatCompletionContentPartKind.Text, text: part.text });
				} else if (part.inlineData) {
					messageParts.push({
						type: Raw.ChatCompletionContentPartKind.Image,
						imageUrl: { url: `data:${part.inlineData.mimeType};base64,${part.inlineData.data}` }
					});
				} else if (part.functionCall && part.functionCall.name) {
					toolCalls ??= [];
					toolCalls.push({
						id: part.functionCall.name, // Gemini doesn't have call IDs, use name
						type: 'function',
						function: {
							name: part.functionCall.name,
							arguments: JSON.stringify(part.functionCall.args ?? {})
						}
					});
				} else if (part.functionResponse && part.functionResponse.name) {
					// Function responses should be emitted as tool messages
					const toolContent: Raw.ChatCompletionContentPart[] = [];

					// Handle structured response that might contain image data
					// (shape produced by apiContentToGeminiContent: { images: [{ mimeType, size, data }] })
					const response = part.functionResponse.response;
					if (response && typeof response === 'object' && 'images' in response && Array.isArray(response.images)) {
						// Extract images from structured response and convert to Raw format
						for (const img of response.images) {
							if (img && typeof img === 'object' && 'data' in img && 'mimeType' in img) {
								toolContent.push({
									type: Raw.ChatCompletionContentPartKind.Image,
									imageUrl: { url: `data:${img.mimeType};base64,${img.data}` }
								});
							}
						}

						// Create a clean response object without the raw image data for text content
						const cleanResponse = { ...response };
						if ('images' in cleanResponse) {
							// Keep only mimeType/size so base64 payloads don't bloat the text part.
							cleanResponse.images = response.images.map((img: any) => ({
								mimeType: img.mimeType,
								size: img.size || (img.data ? img.data.length : 0)
							}));
						}
						toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(cleanResponse) });
					} else {
						// Standard text-only response
						toolContent.push({ type: Raw.ChatCompletionContentPartKind.Text, text: JSON.stringify(response) });
					}

					rawMessages.push({
						role: Raw.ChatRole.Tool,
						content: toolContent,
						toolCallId: part.functionResponse.name
					});
				}
			});
		}

		// Add the main message if it has content
		if (messageParts.length > 0 || toolCalls) {
			const role = content.role === 'model' ? Raw.ChatRole.Assistant : Raw.ChatRole.User;
			const msg: Raw.ChatMessage = { role, content: messageParts };

			// toolCalls only belong on Assistant messages in the Raw schema.
			if (toolCalls && content.role === 'model') {
				(msg as Raw.AssistantChatMessage).toolCalls = toolCalls;
			}

			rawMessages.push(msg);
		}
	}

	return rawMessages;
}