Path: blob/main/extensions/copilot/src/platform/endpoint/node/chatEndpoint.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
import { RequestMetadata, RequestType } from '@vscode/copilot-api';
import { OpenAI, Raw } from '@vscode/prompt-tsx';
import type { CancellationToken } from 'vscode';
import { ITokenizer, TokenizerType } from '../../../util/common/tokenizer';
import { AsyncIterableObject } from '../../../util/vs/base/common/async';
import { deepClone, mixin } from '../../../util/vs/base/common/objects';
import { generateUuid } from '../../../util/vs/base/common/uuid';
import { IInstantiationService } from '../../../util/vs/platform/instantiation/common/instantiation';
import { IAuthenticationService } from '../../authentication/common/authentication';
import { IChatMLFetcher, Source } from '../../chat/common/chatMLFetcher';
import { ChatFetchResponseType, ChatLocation, ChatResponse } from '../../chat/common/commonTypes';
import { getTextPart } from '../../chat/common/globalStringUtils';
import { CHAT_MODEL, ConfigKey, IConfigurationService } from '../../configuration/common/configurationService';
import { ILogService } from '../../log/common/logService';
import { isAnthropicContextEditingEnabled } from '../../networking/common/anthropic';
import { FinishedCallback, getRequestId, ICopilotToolCall, OptionalChatRequestParams } from '../../networking/common/fetch';
import { IFetcherService, Response } from '../../networking/common/fetcherService';
import { createCapiRequestBody, IChatEndpoint, IChatEndpointTokenPricing, ICreateEndpointBodyOptions, IEndpointBody, IMakeChatRequestOptions } from '../../networking/common/networking';
import { CAPIChatMessage, ChatCompletion, FinishedCompletionReason, RawMessageConversionCallback } from '../../networking/common/openai';
import { prepareChatCompletionForReturn } from '../../networking/node/chatStream';
import { IChatWebSocketManager } from '../../networking/node/chatWebSocketManager';
import { SSEProcessor } from '../../networking/node/stream';
import { IExperimentationService } from '../../telemetry/common/nullExperimentationService';
import { ITelemetryService, TelemetryProperties } from '../../telemetry/common/telemetry';
import { TelemetryData } from '../../telemetry/common/telemetryData';
import { ITokenizerProvider } from '../../tokenizer/node/tokenizer';
import { ICAPIClientService } from '../common/capiClient';
import { isAnthropicFamily, isGeminiFamily, modelSupportsContextEditing, modelSupportsToolSearch } from '../common/chatModelCapabilities';
import { IDomainService } from '../common/domainService';
import { CustomModel, IChatModelInformation, IModelTokenPrices, ModelSupportedEndpoint } from '../common/endpointProvider';
import { createMessagesRequestBody, processResponseFromMessagesEndpoint } from './messagesApi';
import { createResponsesRequestBody, getResponsesApiCompactionThreshold, processResponseFromChatEndpoint } from './responsesApi';
import { filterHistoryImages } from './imageLimits';

/**
 * The default processor for the stream format from CAPI.
 *
 * Wraps the SSE stream in an {@link SSEProcessor}, sends a
 * `completion.finishReason` telemetry event per finished choice, and maps each
 * finished choice through {@link prepareChatCompletionForReturn}.
 */
export async function defaultChatResponseProcessor(
	telemetryService: ITelemetryService,
	logService: ILogService,
	response: Response,
	expectedNumChoices: number,
	finishCallback: FinishedCallback,
	telemetryData: TelemetryData,
	cancellationToken?: CancellationToken | undefined
) {
	const processor = await SSEProcessor.create(logService, telemetryService, expectedNumChoices, response, cancellationToken);
	const finishedCompletions = processor.processSSE(finishCallback);
	const chatCompletions = AsyncIterableObject.map(finishedCompletions, (solution) => {
		// A missing finish reason means the client stopped consuming the stream early
		const loggedReason = solution.reason ?? 'client-trimmed';
		const dataToSendToTelemetry = telemetryData.extendedBy({
			completionChoiceFinishReason: loggedReason,
			headerRequestId: solution.requestId.headerRequestId
		});
		telemetryService.sendGHTelemetryEvent('completion.finishReason', dataToSendToTelemetry.properties, dataToSendToTelemetry.measurements);
		return prepareChatCompletionForReturn(telemetryService, logService, solution, telemetryData);
	});
	return chatCompletions;
}

/**
 * Processes a non-streamed (single JSON body) chat completions response into
 * the same async-iterable of {@link ChatCompletion}s that the streaming path
 * produces, invoking {@link finishCallback} once per choice.
 */
export async function defaultNonStreamChatResponseProcessor(response: Response, finishCallback: FinishedCallback, telemetryData: TelemetryData) {
	const textResponse = await response.text();
	const jsonResponse = JSON.parse(textResponse);
	const completions: ChatCompletion[] = [];
	for (let i = 0; i < (jsonResponse?.choices?.length || 0); i++) {
		const choice = jsonResponse.choices[i];
		const message: Raw.AssistantChatMessage = {
			role: choice.message.role,
			content: choice.message.content,
			name: choice.message.name,
			// Normalize property name: OpenAI API uses snake_case (tool_calls) but our types expect camelCase (toolCalls)
			// See: https://platform.openai.com/docs/api-reference/chat/object#chat-object-choices-message-tool_calls
			toolCalls: choice.message.toolCalls ?? choice.message.tool_calls,
		};
		const messageText = getTextPart(message.content);
		// Fall back to a locally generated id so the completion always has a request id
		const requestId = response.headers.get('X-Request-ID') ?? generateUuid();
		const ghRequestId = response.headers.get('x-github-request-id') ?? '';
		const { serverExperiments } = getRequestId(response.headers);

		const completion: ChatCompletion = {
			blockFinished: false,
			choiceIndex: i,
			model: jsonResponse.model,
			filterReason: undefined,
			finishReason: choice.finish_reason as FinishedCompletionReason,
			message: message,
			usage: jsonResponse.usage,
			tokens: [], // This is used for repetition detection so not super important to be accurate
			requestId: { headerRequestId: requestId, gitHubRequestId: ghRequestId, completionId: jsonResponse.id, created: jsonResponse.created, deploymentId: '', serverExperiments },
			telemetryData: telemetryData
		};
		const functionCall: ICopilotToolCall[] = [];
		for (const tool of message.toolCalls ?? []) {
			functionCall.push({
				name: tool.function?.name ?? '',
				arguments: tool.function?.arguments ?? '',
				id: tool.id ?? '',
			});
		}
		await finishCallback(messageText, i, {
			text: messageText,
			copilotToolCalls: functionCall,
		});
		completions.push(completion);
	}

	return AsyncIterableObject.fromArray(completions);
}

const AIC_DIVISOR = 1_000_000_000;
const TOKENS_PER_MILLION = 1_000_000;

/**
 * Converts raw billing token prices into normalized AICs per million tokens.
 *
 * Raw prices are divided by {@link AIC_DIVISOR} to get AICs, then scaled
 * so the result is always "per 1M tokens" regardless of the original batch_size.
 */
function normalizeTokenPricing(tokenPrices: IModelTokenPrices | undefined): IChatEndpointTokenPricing | undefined {
	if (!tokenPrices) {
		return undefined;
	}
	const { batch_size, input_price, output_price, cache_price } = tokenPrices;
	const scale = TOKENS_PER_MILLION / batch_size;
	return {
		inputPrice: (input_price / AIC_DIVISOR) * scale,
		outputPrice: (output_price / AIC_DIVISOR) * scale,
		cacheReadTokenPrice: (cache_price / AIC_DIVISOR) * scale,
	};
}

/**
 * A chat endpoint backed by CAPI model metadata. Exposes the model's
 * capabilities/limits as readonly fields and routes requests to the
 * chat-completions, responses, or (Anthropic) messages API shape depending on
 * the model's supported endpoints and configuration.
 */
export class ChatEndpoint implements IChatEndpoint {
	private readonly _maxTokens: number;
	private readonly _maxOutputTokens: number;
	public readonly model: string;
	public readonly name: string;
	public readonly version: string;
	public readonly modelProvider: string;
	public readonly family: string;
	public readonly tokenizer: TokenizerType;
	public readonly showInModelPicker: boolean;
	public readonly isFallback: boolean;
	public readonly supportsToolCalls: boolean;
	public readonly supportsVision: boolean;
	public readonly supportsPrediction: boolean;
	public readonly supportsAdaptiveThinking?: boolean;
	public readonly minThinkingBudget?: number;
	public readonly maxThinkingBudget?: number;
	public readonly supportsReasoningEffort?: string[];
	public readonly supportsToolSearch?: boolean;
	public readonly supportsContextEditing?: boolean;
	public readonly isPremium?: boolean | undefined;
	public readonly multiplier?: number | undefined;
	public readonly restrictedToSkus?: string[] | undefined;
	public readonly tokenPricing?: IChatEndpointTokenPricing | undefined;
	public readonly customModel?: CustomModel | undefined;
	public readonly maxPromptImages?: number | undefined;

	private readonly _supportsStreaming: boolean;

	constructor(
		public readonly modelMetadata: IChatModelInformation,
		@IDomainService protected readonly _domainService: IDomainService,
		@IChatMLFetcher private readonly _chatMLFetcher: IChatMLFetcher,
		@ITokenizerProvider private readonly _tokenizerProvider: ITokenizerProvider,
		@IInstantiationService protected readonly _instantiationService: IInstantiationService,
		@IConfigurationService protected readonly _configurationService: IConfigurationService,
		@IExperimentationService private readonly _expService: IExperimentationService,
		@IChatWebSocketManager private readonly _chatWebSocketService: IChatWebSocketManager,
		@ILogService _logService: ILogService,
	) {
		// This metadata should always be present, but if not we will default to 8192 tokens
		this._maxTokens = modelMetadata.capabilities.limits?.max_prompt_tokens ?? 8192;
		// This metadata should always be present, but if not we will default to 4096 tokens
		this._maxOutputTokens = modelMetadata.capabilities.limits?.max_output_tokens ?? 4096;
		this.model = modelMetadata.id;
		this.modelProvider = modelMetadata.vendor;
		this.name = modelMetadata.name;
		this.version = modelMetadata.version;
		this.family = modelMetadata.capabilities.family;
		this.tokenizer = modelMetadata.capabilities.tokenizer;
		this.showInModelPicker = modelMetadata.model_picker_enabled;
		this.isPremium = modelMetadata.billing?.is_premium;
		this.multiplier = modelMetadata.billing?.multiplier;
		this.restrictedToSkus = modelMetadata.billing?.restricted_to;
		this.tokenPricing = normalizeTokenPricing(modelMetadata.billing?.token_prices);
		this.isFallback = modelMetadata.is_chat_fallback;
		this.supportsToolCalls = !!modelMetadata.capabilities.supports.tool_calls;
		this.supportsVision = !!modelMetadata.capabilities.supports.vision;
		this.supportsPrediction = !!modelMetadata.capabilities.supports.prediction;
		this.supportsAdaptiveThinking = modelMetadata.capabilities.supports.adaptive_thinking;
		this.minThinkingBudget = modelMetadata.capabilities.supports.min_thinking_budget;
		this.maxThinkingBudget = modelMetadata.capabilities.supports.max_thinking_budget;
		this.supportsReasoningEffort = modelMetadata.capabilities.supports.reasoning_effort;
		// Server-provided capability wins; otherwise fall back to client-side model checks
		this.supportsToolSearch = modelMetadata.capabilities.supports.tool_search ?? modelSupportsToolSearch(this.model, this._configurationService, this._expService);
		this.supportsContextEditing = modelMetadata.capabilities.supports.context_editing ?? modelSupportsContextEditing(this.model);
		this._supportsStreaming = !!modelMetadata.capabilities.supports.streaming;
		this.customModel = modelMetadata.custom_model;
		this.maxPromptImages = modelMetadata.capabilities.limits?.vision?.max_prompt_images;
	}

	// TODO: Thread enableThinking through the fetch pipeline (INetworkRequestOptions / chatMLFetcher positional params)
	// so getExtraHeaders can gate the interleaved-thinking header on whether thinking is actually enabled for the
	// request, rather than using the location check. Once plumbed, replace isAllowedConversationAgentModel with
	// an enableThinking check for the thinking header (keep location gate for context management / tool search).
	public getExtraHeaders(_location?: ChatLocation): Record<string, string> {
		const headers: Record<string, string> = { ...this.modelMetadata.requestHeaders };

		// Anthropic-specific headers only apply when routing through the Messages API
		if (this.useMessagesApi) {

			const modelProviderPreference = this._configurationService.getConfig(ConfigKey.TeamInternal.ModelProviderPreference);
			if (modelProviderPreference) {
				headers['X-Model-Provider-Preference'] = modelProviderPreference;
			}

			const betas: string[] = [];

			if (!this.supportsAdaptiveThinking) {
				betas.push('interleaved-thinking-2025-05-14');
			}
			if (this.supportsToolSearch) {
				betas.push('advanced-tool-use-2025-11-20');
			}
			if (isAnthropicContextEditingEnabled(this, this._configurationService, this._expService)) {
				betas.push('context-management-2025-06-27');
			}
			if (betas.length > 0) {
				headers['anthropic-beta'] = betas.join(',');
			}
		}

		return headers;
	}

	public get modelMaxPromptTokens(): number {
		return this._maxTokens;
	}

	public get maxOutputTokens(): number {
		return this._maxOutputTokens;
	}

	public get urlOrRequestMetadata(): string | RequestMetadata {
		// Use override or respect setting.
		// TODO unlikely but would break if it changes in the middle of a request being constructed
		return this.modelMetadata.urlOrRequestMetadata ??
			(this.useResponsesApi ? { type: RequestType.ChatResponses } :
				this.useMessagesApi ? { type: RequestType.ChatMessages } : { type: RequestType.ChatCompletions });
	}

	protected get useResponsesApi(): boolean {
		// NOTE(review): this early-return branch is subsumed by the final return below —
		// both reduce to "supported_endpoints includes Responses". Kept as-is to preserve behavior.
		if (this.modelMetadata.supported_endpoints
			&& !this.modelMetadata.supported_endpoints.includes(ModelSupportedEndpoint.ChatCompletions)
			&& this.modelMetadata.supported_endpoints.includes(ModelSupportedEndpoint.Responses)
		) {
			return true;
		}

		return !!this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.Responses);
	}

	protected get useWebSocketResponsesApi(): boolean {
		return !!this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.WebSocketResponses);
	}

	protected get useMessagesApi(): boolean {
		// Gated on both the experiment flag and the model advertising Messages support
		const enableMessagesApi = this._configurationService.getExperimentBasedConfig(ConfigKey.UseAnthropicMessagesApi, this._expService);
		return !!(enableMessagesApi && this.modelMetadata.supported_endpoints?.includes(ModelSupportedEndpoint.Messages));
	}

	public get degradationReason(): string | undefined {
		// Warnings take precedence over informational messages
		return this.modelMetadata.warning_messages?.at(0)?.message ?? this.modelMetadata.info_messages?.at(0)?.message;
	}

	public get apiType(): string {
		return this.useResponsesApi ? 'responses' :
			this.useMessagesApi ? 'messages' : 'chatCompletions';
	}

	/**
	 * Mutates the outgoing request body in place to account for model quirks:
	 * strips tools for models without tool-call support, disables streaming for
	 * non-streaming models, and rewrites system messages to user messages for o1.
	 */
	interceptBody(body: IEndpointBody | undefined): void {
		// Remove tool calls from requests that don't support them
		// We really shouldn't make requests to models that don't support tool calls with tools though
		if (body && !this.supportsToolCalls) {
			delete body['tools'];
		}

		// If the model doesn't support streaming, don't ask for a streamed request
		if (body && !this._supportsStreaming) {
			body.stream = false;
		}

		// If it's o1 we must modify the body significantly as the request is very different
		if (body?.messages && (this.family.startsWith('o1') || this.model === CHAT_MODEL.O1 || this.model === CHAT_MODEL.O1MINI)) {
			const newMessages: CAPIChatMessage[] = body.messages.map((message: CAPIChatMessage): CAPIChatMessage => {
				if (message.role === OpenAI.ChatRole.System) {
					return {
						role: OpenAI.ChatRole.User,
						content: message.content,
					};
				} else {
					return message;
				}
			});
			// Add the messages & model back
			body['messages'] = newMessages;
		}
	}

	/**
	 * Builds the request body for whichever API shape this endpoint uses
	 * (responses / messages / chat completions), after applying any per-model
	 * image limits to the message history.
	 */
	createRequestBody(options: ICreateEndpointBodyOptions): IEndpointBody {
		// Determine per-model image limit for APIs with known restrictions
		const imageLimit = this.getImageLimit();
		if (imageLimit !== undefined) {
			options = { ...options, messages: this.validateAndFilterImages(options.messages, imageLimit) };
		}

		if (this.useResponsesApi) {
			const body = this._instantiationService.invokeFunction(createResponsesRequestBody, options, this.model, this);
			return this.customizeResponsesBody(body);
		} else if (this.useMessagesApi) {
			const body = this._instantiationService.invokeFunction(createMessagesRequestBody, options, this.model, this);
			return this.customizeMessagesBody(body);
		} else {
			const body = createCapiRequestBody(options, this.model, this.getCompletionsCallback());
			return this.customizeCapiBody(body, options);
		}
	}

	/**
	 * Returns the model-specific image limit, or `undefined` if no limit applies.
	 * Anthropic Messages API allows up to 20 images per request; Gemini allows up to 10.
	 * These are hardcoded based on API documentation rather than model metadata to
	 * avoid being clamped by unreliable server-provided values.
	 */
	private getImageLimit(): number | undefined {
		if (this.useMessagesApi && isAnthropicFamily(this)) {
			return 20;
		}
		if (isGeminiFamily(this)) {
			return 10;
		}
		return undefined;
	}

	/**
	 * Thin wrapper around {@link filterHistoryImages} retained for test ergonomics.
	 */
	private validateAndFilterImages(messages: Raw.ChatMessage[], maxImages: number): Raw.ChatMessage[] {
		return filterHistoryImages(messages, maxImages);
	}

	// Subclasses may supply a conversion callback used when building CAPI bodies
	protected getCompletionsCallback(): RawMessageConversionCallback | undefined {
		return undefined;
	}

	// Hook for subclasses to adjust a Messages API body; default is a no-op
	protected customizeMessagesBody(body: IEndpointBody): IEndpointBody {
		return body;
	}

	// Hook for subclasses to adjust a Responses API body; default is a no-op
	protected customizeResponsesBody(body: IEndpointBody): IEndpointBody {
		return body;
	}

	protected customizeCapiBody(body: IEndpointBody, options: ICreateEndpointBodyOptions): IEndpointBody {

		// Apply Gemini function calling mode if configured
		const hasTools = !!options.requestOptions?.tools?.length;
		if (hasTools && this.family.toLowerCase().includes('gemini-3')) {
			const geminiFunctionCallingMode = this._configurationService.getExperimentBasedConfig(
				ConfigKey.TeamInternal.GeminiFunctionCallingMode,
				this._expService
			);
			// Only override tool_choice if experiment provides a value and user hasn't specified a function call
			if (geminiFunctionCallingMode && typeof body.tool_choice !== 'object') {
				body.tool_choice = geminiFunctionCallingMode;
			}
		}

		return body;
	}

	/**
	 * Dispatches the raw HTTP response to the processor matching this endpoint's
	 * API shape: responses API, messages API, non-streamed JSON, or SSE stream.
	 */
	public async processResponseFromChatEndpoint(
		telemetryService: ITelemetryService,
		logService: ILogService,
		response: Response,
		expectedNumChoices: number,
		finishCallback: FinishedCallback,
		telemetryData: TelemetryData,
		cancellationToken?: CancellationToken | undefined
	): Promise<AsyncIterableObject<ChatCompletion>> {
		if (this.useResponsesApi) {
			const compactionThreshold = getResponsesApiCompactionThreshold(this._configurationService, this._expService, this);
			return processResponseFromChatEndpoint(this._instantiationService, telemetryService, logService, response, expectedNumChoices, finishCallback, telemetryData, compactionThreshold);
		} else if (this.useMessagesApi) {
			return processResponseFromMessagesEndpoint(this._instantiationService, telemetryService, logService, response, finishCallback, telemetryData);
		} else if (!this._supportsStreaming) {
			return defaultNonStreamChatResponseProcessor(response, finishCallback, telemetryData);
		} else {
			return defaultChatResponseProcessor(telemetryService, logService, response, expectedNumChoices, finishCallback, telemetryData, cancellationToken);
		}
	}

	public acquireTokenizer(): ITokenizer {
		return this._tokenizerProvider.acquireTokenizer(this);
	}

	/**
	 * Issues a chat request, deciding whether to use the websocket transport and
	 * whether to honor stateful markers. If the server rejects the stateful
	 * marker, the request is retried once with the marker ignored.
	 */
	public async makeChatRequest2(options: IMakeChatRequestOptions, token: CancellationToken): Promise<ChatResponse> {
		const useWebSocket = options.useWebSocket ?? !!(
			options.turnId
			&& options.conversationId
			&& this.useWebSocketResponsesApi
			&& this._configurationService.getExperimentBasedConfig(ConfigKey.TeamInternal.ResponsesApiWebSocketEnabled, this._expService)
		);
		// Only trust a stateful marker when an active websocket connection exists for this conversation
		const ignoreStatefulMarker = options.ignoreStatefulMarker ?? !(
			useWebSocket
			&& options.conversationId
			&& options.turnId
			&& this._chatWebSocketService.hasActiveConnection(options.conversationId)
		);
		const response = await this._makeChatRequest2({
			...options,
			useWebSocket,
			ignoreStatefulMarker,
		}, token);
		if (response.type === ChatFetchResponseType.InvalidStatefulMarker) {
			// One retry with the marker explicitly ignored
			return this._makeChatRequest2({
				...options,
				useWebSocket,
				ignoreStatefulMarker: true
			}, token);
		}
		return response;
	}

	protected async _makeChatRequest2(options: IMakeChatRequestOptions, token: CancellationToken) {
		return this._chatMLFetcher.fetchOne({
			requestOptions: {},
			...options,
			endpoint: this,
		}, token);
	}

	// Positional-parameter convenience wrapper over makeChatRequest2
	public async makeChatRequest(
		debugName: string,
		messages: Raw.ChatMessage[],
		finishedCb: FinishedCallback | undefined,
		token: CancellationToken,
		location: ChatLocation,
		source?: Source,
		requestOptions?: Omit<OptionalChatRequestParams, 'n'>,
		userInitiatedRequest?: boolean,
		telemetryProperties?: TelemetryProperties,
	): Promise<ChatResponse> {
		return this.makeChatRequest2({
			debugName,
			messages,
			finishedCb,
			location,
			source,
			requestOptions,
			userInitiatedRequest,
			telemetryProperties,
		}, token);
	}

	/**
	 * Creates a copy of this endpoint whose max prompt token budget is
	 * overridden; all other metadata is deep-cloned unchanged.
	 */
	public cloneWithTokenOverride(modelMaxPromptTokens: number): IChatEndpoint {
		return this._instantiationService.createInstance(
			ChatEndpoint,
			mixin(deepClone(this.modelMetadata), { capabilities: { limits: { max_prompt_tokens: modelMaxPromptTokens } } }));
	}
}

/**
 * A {@link ChatEndpoint} whose URL/request metadata is fixed at construction
 * time (for remote agents) and which always processes responses as an SSE
 * stream expecting multiple choices.
 */
export class RemoteAgentChatEndpoint extends ChatEndpoint {
	constructor(
		modelMetadata: IChatModelInformation,
		private readonly _requestMetadata: RequestMetadata,
		@IDomainService domainService: IDomainService,
		@ICAPIClientService capiClientService: ICAPIClientService,
		@IFetcherService fetcherService: IFetcherService,
		@ITelemetryService telemetryService: ITelemetryService,
		@IAuthenticationService authService: IAuthenticationService,
		@IChatMLFetcher chatMLFetcher: IChatMLFetcher,
		@ITokenizerProvider tokenizerProvider: ITokenizerProvider,
		@IInstantiationService instantiationService: IInstantiationService,
		@IConfigurationService configService: IConfigurationService,
		@IExperimentationService experimentService: IExperimentationService,
		@IChatWebSocketManager chatWebSocketService: IChatWebSocketManager,
		@ILogService logService: ILogService
	) {
		super(
			modelMetadata,
			domainService,
			chatMLFetcher,
			tokenizerProvider,
			instantiationService,
			configService,
			experimentService,
			chatWebSocketService,
			logService
		);
	}

	override processResponseFromChatEndpoint(
		telemetryService: ITelemetryService,
		logService: ILogService,
		response: Response,
		expectedNumChoices: number,
		finishCallback: FinishedCallback,
		telemetryData: TelemetryData,
		cancellationToken?: CancellationToken | undefined,
		_location?: ChatLocation,
	): Promise<AsyncIterableObject<ChatCompletion>> {
		// We must override expectedNumChoices to a value > 1 because remote agents can do internal
		// function calls which emit multiple completions even when only one choice was requested
		// It's awful that they do this, but we have to support it
		return defaultChatResponseProcessor(telemetryService, logService, response, 2, finishCallback, telemetryData, cancellationToken);
	}

	public override get urlOrRequestMetadata() {
		return this._requestMetadata;
	}
}