GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/platform/endpoint/node/chatEndpoint.ts
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
import { RequestMetadata, RequestType } from '@vscode/copilot-api';
import { OpenAI, Raw } from '@vscode/prompt-tsx';
import type { CancellationToken } from 'vscode';
import { ITokenizer, TokenizerType } from '../../../util/common/tokenizer';
import { AsyncIterableObject } from '../../../util/vs/base/common/async';
import { deepClone, mixin } from '../../../util/vs/base/common/objects';
import { generateUuid } from '../../../util/vs/base/common/uuid';
import { IInstantiationService } from '../../../util/vs/platform/instantiation/common/instantiation';
import { IAuthenticationService } from '../../authentication/common/authentication';
import { IChatMLFetcher, Source } from '../../chat/common/chatMLFetcher';
import { ChatFetchResponseType, ChatLocation, ChatResponse } from '../../chat/common/commonTypes';
import { getTextPart } from '../../chat/common/globalStringUtils';
import { CHAT_MODEL, ConfigKey, IConfigurationService } from '../../configuration/common/configurationService';
import { ILogService } from '../../log/common/logService';
import { isAnthropicContextEditingEnabled } from '../../networking/common/anthropic';
import { FinishedCallback, getRequestId, ICopilotToolCall, OptionalChatRequestParams } from '../../networking/common/fetch';
import { IFetcherService, Response } from '../../networking/common/fetcherService';
import { createCapiRequestBody, IChatEndpoint, IChatEndpointTokenPricing, ICreateEndpointBodyOptions, IEndpointBody, IMakeChatRequestOptions } from '../../networking/common/networking';
import { CAPIChatMessage, ChatCompletion, FinishedCompletionReason, RawMessageConversionCallback } from '../../networking/common/openai';
import { prepareChatCompletionForReturn } from '../../networking/node/chatStream';
import { IChatWebSocketManager } from '../../networking/node/chatWebSocketManager';
import { SSEProcessor } from '../../networking/node/stream';
import { IExperimentationService } from '../../telemetry/common/nullExperimentationService';
import { ITelemetryService, TelemetryProperties } from '../../telemetry/common/telemetry';
import { TelemetryData } from '../../telemetry/common/telemetryData';
import { ITokenizerProvider } from '../../tokenizer/node/tokenizer';
import { ICAPIClientService } from '../common/capiClient';
import { isAnthropicFamily, isGeminiFamily, modelSupportsContextEditing, modelSupportsToolSearch } from '../common/chatModelCapabilities';
import { IDomainService } from '../common/domainService';
import { CustomModel, IChatModelInformation, IModelTokenPrices, ModelSupportedEndpoint } from '../common/endpointProvider';
import { createMessagesRequestBody, processResponseFromMessagesEndpoint } from './messagesApi';
import { createResponsesRequestBody, getResponsesApiCompactionThreshold, processResponseFromChatEndpoint } from './responsesApi';
import { filterHistoryImages } from './imageLimits';

/**
 * The default processor for the stream format from CAPI
 */
export async function defaultChatResponseProcessor(
	telemetryService: ITelemetryService,
	logService: ILogService,
	response: Response,
	expectedNumChoices: number,
	finishCallback: FinishedCallback,
	telemetryData: TelemetryData,
	cancellationToken?: CancellationToken | undefined
) {
	const processor = await SSEProcessor.create(logService, telemetryService, expectedNumChoices, response, cancellationToken);
	const finishedCompletions = processor.processSSE(finishCallback);
	const chatCompletions = AsyncIterableObject.map(finishedCompletions, (solution) => {
		const loggedReason = solution.reason ?? 'client-trimmed';
		const dataToSendToTelemetry = telemetryData.extendedBy({
			completionChoiceFinishReason: loggedReason,
			headerRequestId: solution.requestId.headerRequestId
		});
		telemetryService.sendGHTelemetryEvent('completion.finishReason', dataToSendToTelemetry.properties, dataToSendToTelemetry.measurements);
		return prepareChatCompletionForReturn(telemetryService, logService, solution, telemetryData);
	});
	return chatCompletions;
}

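/**
 * Processes a complete (non-streamed) chat completions response body and adapts
 * it to the same `FinishedCallback` + `AsyncIterableObject<ChatCompletion>`
 * contract as the streaming path. Illustrative sketch of the JSON shape this
 * expects (OpenAI-style chat completions response; values are hypothetical):
 *
 *   { "id": "...", "model": "...", "created": 123, "usage": { ... },
 *     "choices": [{ "finish_reason": "stop",
 *                   "message": { "role": "assistant", "content": "...", "tool_calls": [ ... ] } }] }
 */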
export async function defaultNonStreamChatResponseProcessor(response: Response, finishCallback: FinishedCallback, telemetryData: TelemetryData) {
	const textResponse = await response.text();
	const jsonResponse = JSON.parse(textResponse);
	const completions: ChatCompletion[] = [];
	for (let i = 0; i < (jsonResponse?.choices?.length || 0); i++) {
		const choice = jsonResponse.choices[i];
		const message: Raw.AssistantChatMessage = {
			role: choice.message.role,
			content: choice.message.content,
			name: choice.message.name,
			// Normalize property name: OpenAI API uses snake_case (tool_calls) but our types expect camelCase (toolCalls)
			// See: https://platform.openai.com/docs/api-reference/chat/object#chat-object-choices-message-tool_calls
			toolCalls: choice.message.toolCalls ?? choice.message.tool_calls,
		};
		const messageText = getTextPart(message.content);
		const requestId = response.headers.get('X-Request-ID') ?? generateUuid();
		const ghRequestId = response.headers.get('x-github-request-id') ?? '';
		const { serverExperiments } = getRequestId(response.headers);

		const completion: ChatCompletion = {
			blockFinished: false,
			choiceIndex: i,
			model: jsonResponse.model,
			filterReason: undefined,
			finishReason: choice.finish_reason as FinishedCompletionReason,
			message: message,
			usage: jsonResponse.usage,
			tokens: [], // Only used for repetition detection, so it need not be accurate
			requestId: { headerRequestId: requestId, gitHubRequestId: ghRequestId, completionId: jsonResponse.id, created: jsonResponse.created, deploymentId: '', serverExperiments },
			telemetryData: telemetryData
		};
		const functionCall: ICopilotToolCall[] = [];
		for (const tool of message.toolCalls ?? []) {
			functionCall.push({
				name: tool.function?.name ?? '',
				arguments: tool.function?.arguments ?? '',
				id: tool.id ?? '',
			});
		}
		await finishCallback(messageText, i, {
			text: messageText,
			copilotToolCalls: functionCall,
		});
		completions.push(completion);
	}

	return AsyncIterableObject.fromArray(completions);
}

const AIC_DIVISOR = 1_000_000_000;
const TOKENS_PER_MILLION = 1_000_000;

/**
 * Converts raw billing token prices into normalized AICs per million tokens.
 *
 * Raw prices are divided by {@link AIC_DIVISOR} to get AICs, then scaled
 * so the result is always "per 1M tokens" regardless of the original batch_size.
 */
function normalizeTokenPricing(tokenPrices: IModelTokenPrices | undefined): IChatEndpointTokenPricing | undefined {
	if (!tokenPrices) {
		return undefined;
	}
	const { batch_size, input_price, output_price, cache_price } = tokenPrices;
	const scale = TOKENS_PER_MILLION / batch_size;
	return {
		inputPrice: (input_price / AIC_DIVISOR) * scale,
		outputPrice: (output_price / AIC_DIVISOR) * scale,
		cacheReadTokenPrice: (cache_price / AIC_DIVISOR) * scale,
	};
}
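// Worked example (hypothetical numbers, not real pricing): with batch_size = 1000
// and input_price = 2_000_000_000, scale = 1_000_000 / 1000 = 1000, so
// inputPrice = (2_000_000_000 / 1_000_000_000) * 1000 = 2000 AICs per 1M tokens.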

export class ChatEndpoint implements IChatEndpoint {
	private readonly _maxTokens: number;
	private readonly _maxOutputTokens: number;
	public readonly model: string;
	public readonly name: string;
	public readonly version: string;
	public readonly modelProvider: string;
	public readonly family: string;
	public readonly tokenizer: TokenizerType;
	public readonly showInModelPicker: boolean;
	public readonly isFallback: boolean;
	public readonly supportsToolCalls: boolean;
	public readonly supportsVision: boolean;
	public readonly supportsPrediction: boolean;
	public readonly supportsAdaptiveThinking?: boolean;
	public readonly minThinkingBudget?: number;
	public readonly maxThinkingBudget?: number;
	public readonly supportsReasoningEffort?: string[];
	public readonly supportsToolSearch?: boolean;
	public readonly supportsContextEditing?: boolean;
	public readonly isPremium?: boolean | undefined;
	public readonly multiplier?: number | undefined;
	public readonly restrictedToSkus?: string[] | undefined;
	public readonly tokenPricing?: IChatEndpointTokenPricing | undefined;
	public readonly customModel?: CustomModel | undefined;
	public readonly maxPromptImages?: number | undefined;

	private readonly _supportsStreaming: boolean;

	constructor(
		public readonly modelMetadata: IChatModelInformation,
		@IDomainService protected readonly _domainService: IDomainService,
		@IChatMLFetcher private readonly _chatMLFetcher: IChatMLFetcher,
		@ITokenizerProvider private readonly _tokenizerProvider: ITokenizerProvider,
		@IInstantiationService protected readonly _instantiationService: IInstantiationService,
		@IConfigurationService protected readonly _configurationService: IConfigurationService,
		@IExperimentationService private readonly _expService: IExperimentationService,
		@IChatWebSocketManager private readonly _chatWebSocketService: IChatWebSocketManager,
		@ILogService _logService: ILogService,
	) {
		// This metadata should always be present, but if not we will default to 8192 tokens
		this._maxTokens = modelMetadata.capabilities.limits?.max_prompt_tokens ?? 8192;
		// This metadata should always be present, but if not we will default to 4096 tokens
		this._maxOutputTokens = modelMetadata.capabilities.limits?.max_output_tokens ?? 4096;
		this.model = modelMetadata.id;
		this.modelProvider = modelMetadata.vendor;
		this.name = modelMetadata.name;
		this.version = modelMetadata.version;
		this.family = modelMetadata.capabilities.family;
		this.tokenizer = modelMetadata.capabilities.tokenizer;
		this.showInModelPicker = modelMetadata.model_picker_enabled;
		this.isPremium = modelMetadata.billing?.is_premium;
		this.multiplier = modelMetadata.billing?.multiplier;
		this.restrictedToSkus = modelMetadata.billing?.restricted_to;
		this.tokenPricing = normalizeTokenPricing(modelMetadata.billing?.token_prices);
		this.isFallback = modelMetadata.is_chat_fallback;
		this.supportsToolCalls = !!modelMetadata.capabilities.supports.tool_calls;
		this.supportsVision = !!modelMetadata.capabilities.supports.vision;
		this.supportsPrediction = !!modelMetadata.capabilities.supports.prediction;
		this.supportsAdaptiveThinking = modelMetadata.capabilities.supports.adaptive_thinking;
		this.minThinkingBudget = modelMetadata.capabilities.supports.min_thinking_budget;
		this.maxThinkingBudget = modelMetadata.capabilities.supports.max_thinking_budget;
		this.supportsReasoningEffort = modelMetadata.capabilities.supports.reasoning_effort;
		this.supportsToolSearch = modelMetadata.capabilities.supports.tool_search ?? modelSupportsToolSearch(this.model, this._configurationService, this._expService);
		this.supportsContextEditing = modelMetadata.capabilities.supports.context_editing ?? modelSupportsContextEditing(this.model);
		this._supportsStreaming = !!modelMetadata.capabilities.supports.streaming;
		this.customModel = modelMetadata.custom_model;
		this.maxPromptImages = modelMetadata.capabilities.limits?.vision?.max_prompt_images;
	}

	// TODO: Thread enableThinking through the fetch pipeline (INetworkRequestOptions / chatMLFetcher positional params)
	// so getExtraHeaders can gate the interleaved-thinking header on whether thinking is actually enabled for the
	// request, rather than using the location check. Once plumbed, replace isAllowedConversationAgentModel with
	// an enableThinking check for the thinking header (keep location gate for context management / tool search).
	public getExtraHeaders(_location?: ChatLocation): Record<string, string> {
		const headers: Record<string, string> = { ...this.modelMetadata.requestHeaders };

		if (this.useMessagesApi) {

			const modelProviderPreference = this._configurationService.getConfig(ConfigKey.TeamInternal.ModelProviderPreference);
			if (modelProviderPreference) {
				headers['X-Model-Provider-Preference'] = modelProviderPreference;
			}

			const betas: string[] = [];

			if (!this.supportsAdaptiveThinking) {
				betas.push('interleaved-thinking-2025-05-14');
			}
			if (this.supportsToolSearch) {
				betas.push('advanced-tool-use-2025-11-20');
			}
			if (isAnthropicContextEditingEnabled(this, this._configurationService, this._expService)) {
				betas.push('context-management-2025-06-27');
			}
			if (betas.length > 0) {
				headers['anthropic-beta'] = betas.join(',');
			}
		}

		return headers;
	}
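	// Illustrative result (hypothetical model configuration): for a Messages API
	// endpoint without adaptive thinking but with tool search, this returns
	// { ...requestHeaders, 'anthropic-beta': 'interleaved-thinking-2025-05-14,advanced-tool-use-2025-11-20' }.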

	public get modelMaxPromptTokens(): number {
		return this._maxTokens;
	}

	public get maxOutputTokens(): number {
		return this._maxOutputTokens;
	}

	public get urlOrRequestMetadata(): string | RequestMetadata {
		// Use override or respect setting.
		// TODO unlikely but would break if it changes in the middle of a request being constructed
		return this.modelMetadata.urlOrRequestMetadata ??
			(this.useResponsesApi ? { type: RequestType.ChatResponses } :
				this.useMessagesApi ? { type: RequestType.ChatMessages } : { type: RequestType.ChatCompletions });
	}

	protected get useResponsesApi(): boolean {
		if (this.modelMetadata.supported_endpoints
			&& !this.modelMetadata.supported_endpoints.includes(ModelSupportedEndpoint.ChatCompletions)
			&& this.modelMetadata.supported_endpoints.includes(ModelSupportedEndpoint.Responses)
		) {
			return true;
		}

		return !!this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.Responses);
	}

	protected get useWebSocketResponsesApi(): boolean {
		return !!this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.WebSocketResponses);
	}

	protected get useMessagesApi(): boolean {
		const enableMessagesApi = this._configurationService.getExperimentBasedConfig(ConfigKey.UseAnthropicMessagesApi, this._expService);
		return !!(enableMessagesApi && this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.Messages));
	}

	public get degradationReason(): string | undefined {
		return this.modelMetadata.warning_messages?.at(0)?.message ?? this.modelMetadata.info_messages?.at(0)?.message;
	}

	public get apiType(): string {
		return this.useResponsesApi ? 'responses' :
			this.useMessagesApi ? 'messages' : 'chatCompletions';
	}

	interceptBody(body: IEndpointBody | undefined): void {
		// Remove tools from requests to models that don't support tool calls,
		// though we shouldn't be sending tools to such models in the first place
		if (body && !this.supportsToolCalls) {
			delete body['tools'];
		}

		// If the model doesn't support streaming, don't ask for a streamed request
		if (body && !this._supportsStreaming) {
			body.stream = false;
		}

		// o1-family models need a significantly different request body: they don't
		// accept system messages, so convert them to user messages
		if (body?.messages && (this.family.startsWith('o1') || this.model === CHAT_MODEL.O1 || this.model === CHAT_MODEL.O1MINI)) {
			const newMessages: CAPIChatMessage[] = body.messages.map((message: CAPIChatMessage): CAPIChatMessage => {
				if (message.role === OpenAI.ChatRole.System) {
					return {
						role: OpenAI.ChatRole.User,
						content: message.content,
					};
				} else {
					return message;
				}
			});
			// Swap in the converted messages
			body['messages'] = newMessages;
		}
	}
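	// Example of the o1 rewrite above (hypothetical content string):
	// { role: 'system', content: 'Be concise.' } is sent as
	// { role: 'user', content: 'Be concise.' }.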

	createRequestBody(options: ICreateEndpointBodyOptions): IEndpointBody {
		// Determine per-model image limit for APIs with known restrictions
		const imageLimit = this.getImageLimit();
		if (imageLimit !== undefined) {
			options = { ...options, messages: this.validateAndFilterImages(options.messages, imageLimit) };
		}

		if (this.useResponsesApi) {
			const body = this._instantiationService.invokeFunction(createResponsesRequestBody, options, this.model, this);
			return this.customizeResponsesBody(body);
		} else if (this.useMessagesApi) {
			const body = this._instantiationService.invokeFunction(createMessagesRequestBody, options, this.model, this);
			return this.customizeMessagesBody(body);
		} else {
			const body = createCapiRequestBody(options, this.model, this.getCompletionsCallback());
			return this.customizeCapiBody(body, options);
		}
	}

	/**
	 * Returns the model-specific image limit, or `undefined` if no limit applies.
	 * Anthropic Messages API allows up to 20 images per request; Gemini allows up to 10.
	 * These are hardcoded based on API documentation rather than model metadata to
	 * avoid being clamped by unreliable server-provided values.
	 */
	private getImageLimit(): number | undefined {
		if (this.useMessagesApi && isAnthropicFamily(this)) {
			return 20;
		}
		if (isGeminiFamily(this)) {
			return 10;
		}
		return undefined;
	}

	/**
	 * Thin wrapper around {@link filterHistoryImages} retained for test ergonomics.
	 */
	private validateAndFilterImages(messages: Raw.ChatMessage[], maxImages: number): Raw.ChatMessage[] {
		return filterHistoryImages(messages, maxImages);
	}

	protected getCompletionsCallback(): RawMessageConversionCallback | undefined {
		return undefined;
	}

	protected customizeMessagesBody(body: IEndpointBody): IEndpointBody {
		return body;
	}

	protected customizeResponsesBody(body: IEndpointBody): IEndpointBody {
		return body;
	}

	protected customizeCapiBody(body: IEndpointBody, options: ICreateEndpointBodyOptions): IEndpointBody {

		// Apply Gemini function calling mode if configured
		const hasTools = !!options.requestOptions?.tools?.length;
		if (hasTools && this.family.toLowerCase().includes('gemini-3')) {
			const geminiFunctionCallingMode = this._configurationService.getExperimentBasedConfig(
				ConfigKey.TeamInternal.GeminiFunctionCallingMode,
				this._expService
			);
			// Only override tool_choice if experiment provides a value and user hasn't specified a function call
			if (geminiFunctionCallingMode && typeof body.tool_choice !== 'object') {
				body.tool_choice = geminiFunctionCallingMode;
			}
		}

		return body;
	}
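	// Illustrative effect (hypothetical experiment value): if the experiment returns
	// 'AUTO' and the caller did not pass an object-valued tool_choice, the CAPI body
	// is sent with tool_choice: 'AUTO'; an explicit { type: 'function', ... } is preserved.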

	public async processResponseFromChatEndpoint(
		telemetryService: ITelemetryService,
		logService: ILogService,
		response: Response,
		expectedNumChoices: number,
		finishCallback: FinishedCallback,
		telemetryData: TelemetryData,
		cancellationToken?: CancellationToken | undefined
	): Promise<AsyncIterableObject<ChatCompletion>> {
		if (this.useResponsesApi) {
			const compactionThreshold = getResponsesApiCompactionThreshold(this._configurationService, this._expService, this);
			return processResponseFromChatEndpoint(this._instantiationService, telemetryService, logService, response, expectedNumChoices, finishCallback, telemetryData, compactionThreshold);
		} else if (this.useMessagesApi) {
			return processResponseFromMessagesEndpoint(this._instantiationService, telemetryService, logService, response, finishCallback, telemetryData);
		} else if (!this._supportsStreaming) {
			return defaultNonStreamChatResponseProcessor(response, finishCallback, telemetryData);
		} else {
			return defaultChatResponseProcessor(telemetryService, logService, response, expectedNumChoices, finishCallback, telemetryData, cancellationToken);
		}
	}

	public acquireTokenizer(): ITokenizer {
		return this._tokenizerProvider.acquireTokenizer(this);
	}

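	/**
	 * Uses the WebSocket Responses transport only when the request carries a
	 * conversationId/turnId, the model supports it, and the experiment flag is
	 * enabled; stateful markers are only honored while a WebSocket connection is
	 * active. If the server rejects a stale marker (InvalidStatefulMarker), the
	 * request is retried once with ignoreStatefulMarker forced to true.
	 */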
	public async makeChatRequest2(options: IMakeChatRequestOptions, token: CancellationToken): Promise<ChatResponse> {
		const useWebSocket = options.useWebSocket ?? !!(
			options.turnId
			&& options.conversationId
			&& this.useWebSocketResponsesApi
			&& this._configurationService.getExperimentBasedConfig(ConfigKey.TeamInternal.ResponsesApiWebSocketEnabled, this._expService)
		);
		const ignoreStatefulMarker = options.ignoreStatefulMarker ?? !(
			useWebSocket
			&& options.conversationId
			&& options.turnId
			&& this._chatWebSocketService.hasActiveConnection(options.conversationId)
		);
		const response = await this._makeChatRequest2({
			...options,
			useWebSocket,
			ignoreStatefulMarker,
		}, token);
		if (response.type === ChatFetchResponseType.InvalidStatefulMarker) {
			return this._makeChatRequest2({
				...options,
				useWebSocket,
				ignoreStatefulMarker: true
			}, token);
		}
		return response;
	}

	protected async _makeChatRequest2(options: IMakeChatRequestOptions, token: CancellationToken) {
		return this._chatMLFetcher.fetchOne({
			requestOptions: {},
			...options,
			endpoint: this,
		}, token);
	}

	public async makeChatRequest(
		debugName: string,
		messages: Raw.ChatMessage[],
		finishedCb: FinishedCallback | undefined,
		token: CancellationToken,
		location: ChatLocation,
		source?: Source,
		requestOptions?: Omit<OptionalChatRequestParams, 'n'>,
		userInitiatedRequest?: boolean,
		telemetryProperties?: TelemetryProperties,
	): Promise<ChatResponse> {
		return this.makeChatRequest2({
			debugName,
			messages,
			finishedCb,
			location,
			source,
			requestOptions,
			userInitiatedRequest,
			telemetryProperties,
		}, token);
	}

	public cloneWithTokenOverride(modelMaxPromptTokens: number): IChatEndpoint {
		return this._instantiationService.createInstance(
			ChatEndpoint,
			mixin(deepClone(this.modelMetadata), { capabilities: { limits: { max_prompt_tokens: modelMaxPromptTokens } } }));
	}
}
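
// Usage sketch for cloneWithTokenOverride (hypothetical value): it deep-clones the
// model metadata and mixes in a new max_prompt_tokens, e.g.
//   const capped = endpoint.cloneWithTokenOverride(16384);
// yields an endpoint identical to `endpoint` except for its prompt-token budget.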

export class RemoteAgentChatEndpoint extends ChatEndpoint {
	constructor(
		modelMetadata: IChatModelInformation,
		private readonly _requestMetadata: RequestMetadata,
		@IDomainService domainService: IDomainService,
		@ICAPIClientService capiClientService: ICAPIClientService,
		@IFetcherService fetcherService: IFetcherService,
		@ITelemetryService telemetryService: ITelemetryService,
		@IAuthenticationService authService: IAuthenticationService,
		@IChatMLFetcher chatMLFetcher: IChatMLFetcher,
		@ITokenizerProvider tokenizerProvider: ITokenizerProvider,
		@IInstantiationService instantiationService: IInstantiationService,
		@IConfigurationService configService: IConfigurationService,
		@IExperimentationService experimentService: IExperimentationService,
		@IChatWebSocketManager chatWebSocketService: IChatWebSocketManager,
		@ILogService logService: ILogService
	) {
		super(
			modelMetadata,
			domainService,
			chatMLFetcher,
			tokenizerProvider,
			instantiationService,
			configService,
			experimentService,
			chatWebSocketService,
			logService
		);
	}

	override processResponseFromChatEndpoint(
		telemetryService: ITelemetryService,
		logService: ILogService,
		response: Response,
		expectedNumChoices: number,
		finishCallback: FinishedCallback,
		telemetryData: TelemetryData,
		cancellationToken?: CancellationToken | undefined,
		_location?: ChatLocation,
	): Promise<AsyncIterableObject<ChatCompletion>> {
		// We must override the expected number of choices to be > 1 because remote agents
		// can make internal function calls that emit multiple completions even when n = 1.
		// It's awful that they do this, but we have to support it
		return defaultChatResponseProcessor(telemetryService, logService, response, 2, finishCallback, telemetryData, cancellationToken);
	}

	public override get urlOrRequestMetadata() {
		return this._requestMetadata;
	}
}