GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/extension/byok/vscode-node/geminiNativeProvider.ts

/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { ApiError, GenerateContentParameters, GoogleGenAI, Tool, Type } from '@google/genai';
import { CancellationToken, LanguageModelChatInformation, LanguageModelChatMessage, LanguageModelChatMessage2, LanguageModelResponsePart2, LanguageModelTextPart, LanguageModelThinkingPart, LanguageModelToolCallPart, Progress, ProvideLanguageModelChatResponseOptions } from 'vscode';
import { ChatFetchResponseType, ChatLocation } from '../../../platform/chat/common/commonTypes';
import { ILogService } from '../../../platform/log/common/logService';
import { IResponseDelta, OpenAiFunctionTool } from '../../../platform/networking/common/fetch';
import { APIUsage } from '../../../platform/networking/common/openai';
import { CopilotChatAttr, emitInferenceDetailsEvent, GenAiAttr, GenAiMetrics, GenAiOperationName, GenAiProviderName, type OTelModelOptions, StdAttr, toToolDefinitions, truncateForOTel } from '../../../platform/otel/common/index';
import { IOTelService, SpanKind, SpanStatusCode } from '../../../platform/otel/common/otelService';
import { IRequestLogger } from '../../../platform/requestLogger/common/requestLogger';
import { retrieveCapturingTokenByCorrelation, runWithCapturingToken } from '../../../platform/requestLogger/node/requestLogger';
import { ITelemetryService } from '../../../platform/telemetry/common/telemetry';
import { toErrorMessage } from '../../../util/common/errorMessage';
import { RecordedProgress } from '../../../util/common/progressRecorder';
import { generateUuid } from '../../../util/vs/base/common/uuid';
import { BYOKKnownModels, byokKnownModelsToAPIInfo, BYOKModelCapabilities, LMResponsePart } from '../common/byokProvider';
import { toGeminiFunction as toGeminiFunctionDeclaration, ToolJsonSchema } from '../common/geminiFunctionDeclarationConverter';
import { apiMessageToGeminiMessage, geminiMessagesToRawMessagesForLogging } from '../common/geminiMessageConverter';
import { AbstractLanguageModelChatProvider, ExtendedLanguageModelChatInformation, LanguageModelChatConfiguration } from './abstractLanguageModelChatProvider';
import { IBYOKStorageService } from './byokStorageService';

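/**
 * BYOK (bring-your-own-key) chat provider that talks to the Gemini API natively via the
 * `@google/genai` SDK rather than through an OpenAI-compatible endpoint. It lists the
 * user's available Gemini models, streams chat responses (including thinking parts and
 * tool calls), and records request logging, telemetry, and OpenTelemetry data.
 */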
export class GeminiNativeBYOKLMProvider extends AbstractLanguageModelChatProvider {

	public static readonly providerName = 'Gemini';

	constructor(
		knownModels: BYOKKnownModels | undefined,
		byokStorageService: IBYOKStorageService,
		@ILogService logService: ILogService,
		@IRequestLogger private readonly _requestLogger: IRequestLogger,
		@ITelemetryService private readonly _telemetryService: ITelemetryService,
		@IOTelService private readonly _otelService: IOTelService,
	) {
		super(GeminiNativeBYOKLMProvider.providerName.toLowerCase(), GeminiNativeBYOKLMProvider.providerName, knownModels, byokStorageService, logService);
	}

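	/**
	 * Lists the models available for the configured API key via `client.models.list()` and
	 * returns only those present in the known-models registry. When called silently without
	 * an API key, returns an empty list instead of prompting.
	 */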
	protected async getAllModels(silent: boolean, apiKey: string | undefined): Promise<ExtendedLanguageModelChatInformation<LanguageModelChatConfiguration>[]> {
		if (!apiKey && silent) {
			return [];
		}

		try {
			const client = new GoogleGenAI({ apiKey });
			const models = await client.models.list();
			const modelList: Record<string, BYOKModelCapabilities> = {};

			for await (const model of models) {
				const modelId = model.name;
				if (!modelId) {
					continue; // Skip models without names
				}

				// Enable only known models.
				if (this._knownModels && this._knownModels[modelId]) {
					modelList[modelId] = this._knownModels[modelId];
				}
			}
			return byokKnownModelsToAPIInfo(this._name, modelList);
		} catch (e) {
			let error: Error;
			if (e instanceof ApiError) {
				let message = e.message;
				try { message = JSON.parse(message).error?.message; } catch { /* ignore */ }
				error = new Error(message ?? e.message, { cause: e });
			} else {
				error = new Error(toErrorMessage(e, true));
			}
			this._logService.error(error, `Error fetching available ${GeminiNativeBYOKLMProvider.providerName} models`);
			throw error;
		}
	}

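	/**
	 * Handles a chat request against Gemini: converts VS Code messages and tools into the
	 * Gemini request format, streams the response back through `progress`, and records
	 * request logs, OTel spans/metrics, and success/failure telemetry. The work happens in
	 * two nested closures: `doRequest` performs the actual call, and `executeRequest` wraps
	 * it in an OTel span, optionally restoring the parent trace context and CapturingToken.
	 */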
	async provideLanguageModelChatResponse(model: ExtendedLanguageModelChatInformation<LanguageModelChatConfiguration>, messages: Array<LanguageModelChatMessage | LanguageModelChatMessage2>, options: ProvideLanguageModelChatResponseOptions, progress: Progress<LanguageModelResponsePart2>, token: CancellationToken): Promise<any> {
		// Restore CapturingToken context if correlation ID was passed through modelOptions.
		// This handles the case where AsyncLocalStorage context was lost crossing VS Code IPC.
		const correlationId = (options as { modelOptions?: OTelModelOptions }).modelOptions?._capturingTokenCorrelationId;
		const capturingToken = correlationId ? retrieveCapturingTokenByCorrelation(correlationId) : undefined;

		// Restore OTel trace context to link spans back to the agent trace
		const parentTraceContext = (options as { modelOptions?: OTelModelOptions }).modelOptions?._otelTraceContext ?? undefined;

		// OTel span handle — created outside doRequest, enriched inside with usage data
		let otelSpan: ReturnType<typeof this._otelService.startSpan> | undefined;

		const doRequest = async () => {
			const issuedTime = Date.now();
			const apiKey = model.configuration?.apiKey;
			if (!apiKey) {
				throw new Error('API key not found for the model');
			}

			const client = new GoogleGenAI({ apiKey });
			// Convert the messages from the API format into messages that we can use against Gemini
			const { contents, systemInstruction } = apiMessageToGeminiMessage(messages as LanguageModelChatMessage[]);

			const requestId = generateUuid();
			const pendingLoggedChatRequest = this._requestLogger.logChatRequest(
				'GeminiNativeBYOK',
				{
					model: model.id,
					modelMaxPromptTokens: model.maxInputTokens,
					urlOrRequestMetadata: 'https://generativelanguage.googleapis.com',
				},
				{
					model: model.id,
					messages: geminiMessagesToRawMessagesForLogging(contents, systemInstruction),
					ourRequestId: requestId,
					location: ChatLocation.Other,
					body: {
						tools: options.tools?.map((tool): OpenAiFunctionTool => ({
							type: 'function',
							function: {
								name: tool.name,
								description: tool.description,
								parameters: tool.inputSchema
							}
						}))
					}
				});

			// Convert VS Code tools to Gemini function declarations
			const tools: Tool[] = (options.tools ?? []).length > 0 ? [{
				functionDeclarations: (options.tools ?? []).map(tool => {
					if (!tool.inputSchema) {
						return {
							name: tool.name,
							description: tool.description,
							parameters: {
								type: Type.OBJECT,
								properties: {},
								required: []
							}
						};
					}

					// Transform the input schema to match Gemini's expectations
					const finalTool = toGeminiFunctionDeclaration(tool.name, tool.description, tool.inputSchema as ToolJsonSchema);
					finalTool.description = tool.description || finalTool.description;
					return finalTool;
				})
			}] : [];

			// Bridge VS Code cancellation token to Gemini abortSignal for early network termination
			const abortController = new AbortController();
			const cancelSub = token.onCancellationRequested(() => {
				abortController.abort();
				this._logService.trace('Gemini request aborted via VS Code cancellation token');
			});

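			// Assemble the Gemini generateContent request: system instruction, converted tools,
			// the model's output token limit, thought summaries enabled, and the abort signal.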
			const params: GenerateContentParameters = {
				model: model.id,
				contents: contents,
				config: {
					systemInstruction: systemInstruction,
					tools: tools.length > 0 ? tools : undefined,
					maxOutputTokens: model.maxOutputTokens,
					thinkingConfig: {
						includeThoughts: true,
					},
					abortSignal: abortController.signal
				}
			};

			const wrappedProgress = new RecordedProgress(progress);

			try {
				const result = await this._makeRequest(client, wrappedProgress, params, token, issuedTime);
				if (result.ttft) {
					pendingLoggedChatRequest.markTimeToFirstToken(result.ttft);
				}
				pendingLoggedChatRequest.resolve({
					type: ChatFetchResponseType.Success,
					requestId,
					serverRequestId: requestId,
					usage: result.usage,
					resolvedModel: model.id,
					value: ['value'],
				}, wrappedProgress.items.map((i): IResponseDelta => {
					return {
						text: i instanceof LanguageModelTextPart ? i.value : '',
						copilotToolCalls: i instanceof LanguageModelToolCallPart ? [{
							name: i.name,
							arguments: JSON.stringify(i.input),
							id: i.callId
						}] : undefined,
					};
				}));

				// Enrich OTel span with usage data from the Gemini response
				if (otelSpan && result.usage) {
					otelSpan.setAttributes({
						[GenAiAttr.USAGE_INPUT_TOKENS]: result.usage.prompt_tokens ?? 0,
						[GenAiAttr.USAGE_OUTPUT_TOKENS]: result.usage.completion_tokens ?? 0,
						...(result.usage.prompt_tokens_details?.cached_tokens
							? { [GenAiAttr.USAGE_CACHE_READ_INPUT_TOKENS]: result.usage.prompt_tokens_details.cached_tokens }
							: {}),
						[GenAiAttr.RESPONSE_MODEL]: model.id,
						[GenAiAttr.RESPONSE_ID]: requestId,
						[GenAiAttr.RESPONSE_FINISH_REASONS]: ['stop'],
						[GenAiAttr.CONVERSATION_ID]: requestId,
						...(result.ttft ? { [CopilotChatAttr.TIME_TO_FIRST_TOKEN]: result.ttft } : {}),
						[GenAiAttr.REQUEST_MAX_TOKENS]: model.maxOutputTokens ?? 0,
					});
					// Opt-in content capture
					if (this._otelService.config.captureContent) {
						const responseText = wrappedProgress.items
							.filter((p): p is LanguageModelTextPart => p instanceof LanguageModelTextPart)
							.map(p => p.value).join('');
						const toolCalls = wrappedProgress.items
							.filter((p): p is LanguageModelToolCallPart => p instanceof LanguageModelToolCallPart)
							.map(tc => ({ type: 'tool_call' as const, id: tc.callId, name: tc.name, arguments: tc.input }));
						const parts: Array<{ type: string; content?: string; id?: string; name?: string; arguments?: unknown }> = [];
						if (responseText) { parts.push({ type: 'text', content: responseText }); }
						parts.push(...toolCalls);
						if (parts.length > 0) {
							otelSpan.setAttribute(GenAiAttr.OUTPUT_MESSAGES, truncateForOTel(JSON.stringify([{ role: 'assistant', parts }])));
						}
					}
				}

				// Record OTel metrics for this Gemini LLM call
				if (result.usage) {
					const durationSec = (Date.now() - issuedTime) / 1000;
					const metricAttrs = { operationName: GenAiOperationName.CHAT, providerName: 'gemini', requestModel: model.id, responseModel: model.id };
					GenAiMetrics.recordOperationDuration(this._otelService, durationSec, metricAttrs);
					if (result.usage.prompt_tokens) { GenAiMetrics.recordTokenUsage(this._otelService, result.usage.prompt_tokens, 'input', metricAttrs); }
					if (result.usage.completion_tokens) { GenAiMetrics.recordTokenUsage(this._otelService, result.usage.completion_tokens, 'output', metricAttrs); }
					if (result.ttft) { GenAiMetrics.recordTimeToFirstToken(this._otelService, model.id, result.ttft / 1000); }
				}

				// Emit OTel inference details event
				emitInferenceDetailsEvent(
					this._otelService,
					{ model: model.id, maxTokens: model.maxOutputTokens },
					result.usage ? {
						id: requestId,
						model: model.id,
						finishReasons: ['stop'],
						inputTokens: result.usage.prompt_tokens,
						outputTokens: result.usage.completion_tokens,
					} : undefined,
				);

				// Send success telemetry matching response.success format
				/* __GDPR__
					"response.success" : {
						"owner": "digitarald",
						"comment": "Report quality details for a successful service response.",
						"reason": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Reason for why a response finished" },
						"filterReason": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Reason for why a response was filtered" },
						"source": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Source of the initial request" },
						"initiatorType": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Whether the request was initiated by a user or an agent" },
						"model": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Model selection for the response" },
						"modelInvoked": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Actual model invoked for the response" },
						"apiType": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "API type for the response - chat completions or responses" },
						"requestId": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Id of the current turn request" },
						"gitHubRequestId": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "GitHub request id if available" },
						"associatedRequestId": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Another request ID that this request is associated with (eg, the originating request of a summarization request)." },
						"reasoningEffort": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Reasoning effort level" },
						"reasoningSummary": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Reasoning summary level" },
						"fetcher": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "The fetcher used for the request" },
						"transport": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "The transport used for the request (http or websocket)" },
						"totalTokenMax": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Maximum total token window", "isMeasurement": true },
						"clientPromptTokenCount": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of prompt tokens, locally counted", "isMeasurement": true },
						"promptTokenCount": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of prompt tokens, server side counted", "isMeasurement": true },
						"promptCacheTokenCount": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of prompt tokens hitting cache as reported by server", "isMeasurement": true },
						"tokenCountMax": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Maximum generated tokens", "isMeasurement": true },
						"tokenCount": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of generated tokens", "isMeasurement": true },
						"reasoningTokens": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of reasoning tokens", "isMeasurement": true },
						"acceptedPredictionTokens": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Number of tokens in the prediction that appeared in the completion", "isMeasurement": true },
						"rejectedPredictionTokens": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Number of tokens in the prediction that did not appear in the completion", "isMeasurement": true },
						"completionTokens": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Number of tokens in the output", "isMeasurement": true },
						"timeToFirstToken": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Time to first token", "isMeasurement": true },
						"timeToFirstTokenEmitted": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Time to first token emitted (visible text)", "isMeasurement": true },
						"timeToComplete": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Time to complete the request", "isMeasurement": true },
						"issuedTime": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Timestamp when the request was issued", "isMeasurement": true },
						"isVisionRequest": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether the request was for a vision model", "isMeasurement": true },
						"isBYOK": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Whether the request was for a BYOK model", "isMeasurement": true },
						"isAuto": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Whether the request was for an Auto model", "isMeasurement": true },
						"bytesReceived": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Number of bytes received in the response", "isMeasurement": true },
						"retryAfterError": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Error of the original request." },
						"retryAfterErrorGitHubRequestId": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "GitHub request id of the original request if available" },
						"connectivityTestError": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Error of the connectivity test." },
						"connectivityTestErrorGitHubRequestId": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "GitHub request id of the connectivity test request if available" },
						"retryAfterFilterCategory": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "If the response was filtered and this is a retry attempt, this contains the original filtered content category." },
						"suspendEventSeen": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether a system suspend event was seen during the request", "isMeasurement": true },
						"resumeEventSeen": { "classification": "SystemMetaData", "purpose": "PerformanceAndHealth", "comment": "Whether a system resume event was seen during the request", "isMeasurement": true }
					}
				*/
				this._telemetryService.sendTelemetryEvent('response.success', { github: true, microsoft: true }, {
					source: 'byok.gemini',
					model: model.id,
					requestId,
				}, {
					totalTokenMax: model.maxInputTokens ?? -1,
					tokenCountMax: model.maxOutputTokens ?? -1,
					promptTokenCount: result.usage?.prompt_tokens,
					promptCacheTokenCount: result.usage?.prompt_tokens_details?.cached_tokens,
					tokenCount: result.usage?.total_tokens,
					completionTokens: result.usage?.completion_tokens,
					timeToFirstToken: result.ttft,
					timeToFirstTokenEmitted: result.ttfte,
					timeToComplete: Date.now() - issuedTime,
					issuedTime,
					isBYOK: 1,
				});
			} catch (err) {
				this._logService.error(`BYOK GeminiNative error: ${toErrorMessage(err, true)}`);
				pendingLoggedChatRequest.resolve({
					type: token.isCancellationRequested ? ChatFetchResponseType.Canceled : ChatFetchResponseType.Unknown,
					requestId,
					serverRequestId: requestId,
					reason: token.isCancellationRequested ? 'cancelled' : toErrorMessage(err)
				}, wrappedProgress.items.map((i): IResponseDelta => {
					return {
						text: i instanceof LanguageModelTextPart ? i.value : '',
						copilotToolCalls: i instanceof LanguageModelToolCallPart ? [{
							name: i.name,
							arguments: JSON.stringify(i.input),
							id: i.callId
						}] : undefined,
					};
				}));
				throw err;
			} finally {
				cancelSub.dispose();
			}
		};

		// Create OTel span and execute with trace context + CapturingToken
		const executeRequest = async () => {
			otelSpan = this._otelService.startSpan(`chat ${model.id}`, {
				kind: SpanKind.CLIENT,
				attributes: {
					[GenAiAttr.OPERATION_NAME]: GenAiOperationName.CHAT,
					[GenAiAttr.PROVIDER_NAME]: GenAiProviderName.GEMINI,
					[GenAiAttr.REQUEST_MODEL]: model.id,
					[GenAiAttr.AGENT_NAME]: 'GeminiBYOK',
					[CopilotChatAttr.MAX_PROMPT_TOKENS]: model.maxInputTokens,
					[StdAttr.SERVER_ADDRESS]: 'generativelanguage.googleapis.com',
				},
			});
			// Opt-in: capture input messages in OTel GenAI format
			if (this._otelService.config.captureContent) {
				// Tool definitions on the chat span (issue #299934) with `parameters`
				// per OTel GenAI semantic conventions (issue #300318).
				const toolDefs = toToolDefinitions(options.tools);
				if (toolDefs) {
					otelSpan.setAttribute(GenAiAttr.TOOL_DEFINITIONS, truncateForOTel(JSON.stringify(toolDefs)));
				}
				try {
					const roleNames: Record<number, string> = { 1: 'user', 2: 'assistant', 3: 'system' };
					const inputMsgs = messages.map(m => {
						const msg = m as LanguageModelChatMessage;
						const role = roleNames[msg.role] ?? String(msg.role);
						const parts: Array<{ type: string; content?: string; id?: string; name?: string; arguments?: unknown }> = [];
						if (Array.isArray(msg.content)) {
							for (const p of msg.content) {
								if (p instanceof LanguageModelTextPart) {
									parts.push({ type: 'text', content: p.value });
								} else if (p instanceof LanguageModelToolCallPart) {
									parts.push({ type: 'tool_call', id: p.callId, name: p.name, arguments: p.input });
								}
							}
						}
						if (parts.length === 0) {
							parts.push({ type: 'text', content: '[non-text content]' });
						}
						return { role, parts };
					});
					otelSpan.setAttribute(GenAiAttr.INPUT_MESSAGES, truncateForOTel(JSON.stringify(inputMsgs)));
				} catch { /* swallow */ }
			}
			try {
				const result = capturingToken
					? await runWithCapturingToken(capturingToken, doRequest)
					: await doRequest();
				otelSpan.setStatus(SpanStatusCode.OK);
				return result;
			} catch (err) {
				otelSpan.setStatus(SpanStatusCode.ERROR, err instanceof Error ? err.message : String(err));
				throw err;
			} finally {
				otelSpan.end();
			}
		};

		if (parentTraceContext) {
			return this._otelService.runWithTraceContext(parentTraceContext, executeRequest);
		}
		return executeRequest();
	}

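	/**
	 * Rough token count estimate (~4 characters per token). Note that for message objects the
	 * default `toString()` is used, so this is a coarse heuristic rather than an exact count.
	 */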
	async provideTokenCount(model: LanguageModelChatInformation, text: string | LanguageModelChatMessage | LanguageModelChatMessage2, token: CancellationToken): Promise<number> {
		// Simple estimation for approximate token count - actual token count would require Gemini's tokenizer
		return Math.ceil(text.toString().length / 4);
	}

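	/**
	 * Streams a generateContent request and reports parts to `progress` as they arrive:
	 * thought summaries as LanguageModelThinkingPart, text as LanguageModelTextPart, and
	 * function calls as LanguageModelToolCallPart (emitting any pending thought signature
	 * first). Returns time-to-first-token, time-to-first-emitted-token, and accumulated usage.
	 */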
	private async _makeRequest(client: GoogleGenAI, progress: Progress<LMResponsePart>, params: GenerateContentParameters, token: CancellationToken, issuedTime: number): Promise<{ ttft: number | undefined; ttfte: number | undefined; usage: APIUsage | undefined }> {
		const start = Date.now();
		let ttft: number | undefined;
		let ttfte: number | undefined;
		let usage: APIUsage | undefined;

		try {
			const stream = await client.models.generateContentStream(params);

			let pendingThinkingSignature: string | undefined;

			for await (const chunk of stream) {
				if (token.isCancellationRequested) {
					break;
				}

				if (ttft === undefined) {
					ttft = Date.now() - start;
				}

				this._logService.trace(`Gemini chunk: ${JSON.stringify(chunk)}`);

				// Process the streaming response chunks
				if (chunk.candidates && chunk.candidates.length > 0) {
					// choose the primary candidate
					const candidate = chunk.candidates[0];

					if (candidate.content && candidate.content.parts) {
						for (const part of candidate.content.parts) {
							// First, capture thought signature from this part (if present)
							if ('thoughtSignature' in part && part.thoughtSignature) {
								pendingThinkingSignature = part.thoughtSignature as string;
							}
							// Now handle the actual content parts
							if ('thought' in part && part.thought === true && part.text) {
								// Handle thinking/reasoning content from Gemini API
								if (ttfte === undefined) {
									ttfte = Date.now() - issuedTime;
								}
								progress.report(new LanguageModelThinkingPart(part.text));
							} else if (part.text) {
								if (ttfte === undefined) {
									ttfte = Date.now() - issuedTime;
								}
								progress.report(new LanguageModelTextPart(part.text));
							} else if (part.functionCall && part.functionCall.name) {
								// Gemini 3 includes thought signatures for function calling
								// If we have a pending signature, emit it as a thinking part with metadata.signature
								if (pendingThinkingSignature) {
									const thinkingPart = new LanguageModelThinkingPart('', undefined, { signature: pendingThinkingSignature });
									progress.report(thinkingPart);
									pendingThinkingSignature = undefined;
								}

								if (ttfte === undefined) {
									ttfte = Date.now() - issuedTime;
								}
								progress.report(new LanguageModelToolCallPart(
									generateUuid(),
									part.functionCall.name,
									part.functionCall.args || {}
								));
							}
						}
					}
				}

				// Extract usage information if available in the chunk
				// Initialize on first chunk with usageMetadata, then update incrementally
				// This ensures we capture prompt token info even if stream is cancelled mid-way
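				// Illustrative mapping (hypothetical numbers): usageMetadata { promptTokenCount: 1200,
				// candidatesTokenCount: 350, thoughtsTokenCount: 80, totalTokenCount: 1630 } becomes
				// usage { prompt_tokens: 1200, completion_tokens: 430 (350 + 80), total_tokens: 1630 }.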
				if (chunk.usageMetadata) {
					const promptTokens = chunk.usageMetadata.promptTokenCount;
					// For thinking models (e.g., gemini-3-pro-high), candidatesTokenCount only includes
					// regular output tokens. thoughtsTokenCount contains the thinking/reasoning tokens.
					// We include both in the completion token count.
					const candidateTokens = chunk.usageMetadata.candidatesTokenCount ?? 0;
					const thoughtTokens = chunk.usageMetadata.thoughtsTokenCount ?? 0;
					const completionTokens = candidateTokens + thoughtTokens > 0 ? candidateTokens + thoughtTokens : undefined;
					const cachedTokens = chunk.usageMetadata.cachedContentTokenCount;

					if (!usage) {
						// Initialize usage on first chunk - use -1 as sentinel for unavailable values
						usage = {
							completion_tokens: completionTokens ?? -1,
							prompt_tokens: promptTokens ?? -1,
							total_tokens: chunk.usageMetadata.totalTokenCount ?? -1,
							prompt_tokens_details: {
								cached_tokens: cachedTokens ?? 0,
							}
						};
					} else {
						// Update with latest values, preserving existing non-sentinel values
						if (promptTokens !== undefined) {
							usage.prompt_tokens = promptTokens;
						}
						if (completionTokens !== undefined) {
							usage.completion_tokens = completionTokens;
						}
						if (chunk.usageMetadata.totalTokenCount !== undefined) {
							usage.total_tokens = chunk.usageMetadata.totalTokenCount;
						} else if (usage.prompt_tokens !== -1 && usage.completion_tokens !== -1) {
							usage.total_tokens = usage.prompt_tokens + usage.completion_tokens;
						}
						if (cachedTokens !== undefined) {
							usage.prompt_tokens_details!.cached_tokens = cachedTokens;
						}
					}
				}
			}

			return { ttft, ttfte, usage };
		} catch (error) {
			if ((error as any)?.name === 'AbortError' || token.isCancellationRequested) {
				this._logService.trace('Gemini streaming aborted');
				// Return partial usage data collected before cancellation
				return { ttft, ttfte, usage };
			}
			this._logService.error(`Gemini streaming error: ${toErrorMessage(error, true)}`);
			throw error;
		}
	}
}