Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/builtInTools/fetchPageTool.ts
4780 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { assertNever } from '../../../../../base/common/assert.js';
import { CancellationToken } from '../../../../../base/common/cancellation.js';
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
import { Iterable } from '../../../../../base/common/iterator.js';
import { ResourceSet } from '../../../../../base/common/map.js';
import { extname } from '../../../../../base/common/path.js';
import { URI } from '../../../../../base/common/uri.js';
import { localize } from '../../../../../nls.js';
import { IFileService } from '../../../../../platform/files/common/files.js';
import { IWebContentExtractorService, WebContentExtractResult } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
import { ITrustedDomainService } from '../../../url/browser/trustedDomainService.js';
import { IChatService } from '../../common/chatService/chatService.js';
import { LocalChatSessionUri } from '../../common/model/chatUri.js';
import { ChatImageMimeType } from '../../common/languageModels.js';
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/tools/languageModelToolsService.js';
import { InternalFetchWebPageToolId } from '../../common/tools/builtinTools/tools.js';

/**
 * Static tool description for the built-in fetch tool: its id, display name,
 * model-facing description and the JSON schema of its single `urls` input.
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.',
	source: ToolDataSource.Internal,
	canRequestPostApproval: true,
	canRequestPreApproval: true,
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: {
					type: 'string',
				},
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};

/** Input parameters accepted by {@link FetchWebPageTool}. */
export interface IFetchWebPageToolParams {
	urls?: string[];
}

/**
 * Outcome for one requested URL:
 * - `string`: text content read through the file service (or the "binary not supported" notice),
 * - `tooldata`: a supported image returned as a data part,
 * - `extracted`: a result from the web content extractor,
 * - `undefined`: the URL could not be parsed or its content could not be read.
 */
type ResultType = string | { type: 'tooldata'; value: IToolResultDataPart } | { type: 'extracted'; value: WebContentExtractResult } | undefined;

/**
 * Tool implementation that fetches content for a list of URLs. `http`/`https` URLs go
 * through the web content extractor; every other scheme is read through the file service.
 */
export class FetchWebPageTool implements IToolImpl {

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
		@ITrustedDomainService private readonly _trustedDomainService: ITrustedDomainService,
		@IChatService private readonly _chatService: IChatService,
	) { }

	/**
	 * Fetches every requested URL and returns one prompt part per entry of `urls`,
	 * in the order the URLs were given.
	 *
	 * @param invocation Tool invocation whose `parameters` are read as {@link IFetchWebPageToolParams}.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to file reads.
	 * @returns The tool result: content parts, the list of URIs shown as result details, and
	 * `confirmResults` set to `false` when no web result needs confirming.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as IFetchWebPageToolParams).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);

		if (webUris.size === 0 && fileUris.size === 0 && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// Results are keyed by the original URL string rather than tracked by running
		// indices: `_parseUris` de-duplicates repeated URLs (its maps are keyed by the URL
		// string), so an index advanced once per entry of `urls` would drift as soon as the
		// same URL was requested twice.
		const resultsByUrl = new Map<string, ResultType>();

		// Get contents from web URIs
		let webContents: WebContentExtractResult[] = [];
		if (webUris.size > 0) {
			const trustedDomains = this._trustedDomainService.trustedDomains;
			webContents = await this._readerModeService.extract([...webUris.values()], { trustedDomains });

			// NOTE(review): assumes `extract` returns one result per input URI, in input
			// order - the previous index-based pairing relied on the same assumption.
			let webIndex = 0;
			for (const originalUrl of webUris.keys()) {
				const extracted = webContents[webIndex++];
				if (extracted) {
					resultsByUrl.set(originalUrl, { type: 'extracted', value: extracted });
				}
			}
		}

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					resultsByUrl.set(originalUrl, {
						type: 'tooldata',
						value: {
							kind: 'data',
							value: {
								mimeType: imageMimeType,
								data: fileContent.value
							}
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						resultsByUrl.set(originalUrl, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						resultsByUrl.set(originalUrl, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch (error) {
				// If file service can't read it, treat as invalid: no entry is recorded,
				// so the lookup below yields `undefined` for this URL.
			}
		}

		// Build results array in original order. URLs that failed to parse or to be read
		// have no entry and therefore map to `undefined`.
		const results: ResultType[] = urls.map(url => resultsByUrl.get(url));

		// Skip confirming any results if every web content we got was an error or redirect.
		// `every` is vacuously true for an empty array, so this also applies when no web
		// URIs were requested at all.
		let confirmResults: undefined | boolean;
		if (webContents.every(e => e.status === 'error' || e.status === 'redirect')) {
			confirmResults = false;
		}

		// All web URIs are listed; file URIs are listed only when they were read successfully
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(urls, results),
			toolResultDetails: actuallyValidUris,
			confirmResults,
		};
	}

	/**
	 * Builds the progress / past-tense messages and the confirmation prompt for an invocation.
	 *
	 * File URIs that the file service cannot `stat` are reported as invalid. URIs that appear
	 * verbatim (case-insensitively, ignoring one trailing slash) in a user message of the chat
	 * session are removed from the set needing confirmation.
	 *
	 * @param context Preparation context; `context.parameters.urls` is the requested URL list.
	 * @param token Unused.
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch (error) {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...Array.from(invalidUris), ...additionalInvalidUrls];
		const urlsNeedingConfirmation = new ResourceSet([...webUris.values(), ...validFileUris]);

		const pastTenseMessage = invalid.length
			? invalid.length > 1
				// If there are multiple invalid URLs, show them all
				? new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.plural',
						'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', urlsNeedingConfirmation.size, invalid.map(url => `- ${url}`).join('\n')
					))
				// If there is only one invalid URL, show it
				: new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.singular',
						'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
					))
			// No invalid URLs
			: new MarkdownString();

		const invocationMessage = new MarkdownString();
		if (urlsNeedingConfirmation.size > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', urlsNeedingConfirmation.size));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', urlsNeedingConfirmation.size));
		} else if (urlsNeedingConfirmation.size === 1) {
			const url = Iterable.first(urlsNeedingConfirmation)!.toString(true);
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		let confirmationNotNeededReason: string | undefined;
		if (context.chatSessionId) {
			const model = this._chatService.getSession(LocalChatSessionUri.forSession(context.chatSessionId));
			const userMessages = model?.getRequests().map(r => r.message.text.toLowerCase());
			let urlsMentionedInPrompt = false;
			for (const uri of urlsNeedingConfirmation) {
				// Normalize to lowercase and remove any trailing slash
				const toCheck = uri.toString(true).toLowerCase().replace(/\/$/, '');
				if (userMessages?.some(m => m.includes(toCheck))) {
					urlsNeedingConfirmation.delete(uri);
					urlsMentionedInPrompt = true;
				}
			}
			if (urlsMentionedInPrompt && urlsNeedingConfirmation.size === 0) {
				confirmationNotNeededReason = localize('fetchWebPage.urlMentionedInPrompt', 'Auto approved because URL was in prompt');
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		const allDomainsTrusted = Iterable.every(urlsNeedingConfirmation, u => this._trustedDomainService.isValid(u));
		let confirmationTitle: string | undefined;
		let confirmationMessage: string | MarkdownString | undefined;

		// Title and message stay undefined when nothing is left to confirm or when every
		// remaining URI is accepted by the trusted-domain service.
		if (urlsNeedingConfirmation.size && !allDomainsTrusted) {
			if (urlsNeedingConfirmation.size === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					Iterable.first(urlsNeedingConfirmation)!.toString(true),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					[...urlsNeedingConfirmation].map(uri => `- ${uri.toString(true)}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
		}
		result.confirmationMessages = {
			title: confirmationTitle,
			message: confirmationMessage,
			confirmResults: urlsNeedingConfirmation.size > 0,
			allowAutoConfirm: true,
			disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true }),
			confirmationNotNeededReason
		};
		return result;
	}

	/**
	 * Splits the raw URL strings into web URIs (`http`/`https`), URIs of any other scheme
	 * (to be handled by the file service), and strings for which `URI.parse` threw.
	 *
	 * The maps are keyed by the original string, so a URL that is listed more than once
	 * appears only once in the result.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		urls?.forEach(url => {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch (e) {
				invalidUris.add(url);
			}
		});

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts per-URL results into tool result parts. `results[i]` must correspond to
	 * `urls[i]`; a "Fetched from ..." title is attached only when there is more than one result.
	 */
	private _getPromptPartsForResults(urls: string[], results: ResultType[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map((value, i) => {
			const title = results.length > 1 ? localize('fetchWebPage.fetchedFrom', 'Fetched from {0}', urls[i]) : undefined;
			// Compare against `undefined` explicitly: an empty text file yields '' which is
			// valid content and must not be reported as an invalid URL.
			if (value === undefined) {
				return {
					kind: 'text',
					title,
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					title,
					value: value
				};
			} else if (value.type === 'tooldata') {
				return { ...value.value, title };
			} else if (value.type === 'extracted') {
				switch (value.value.status) {
					case 'ok':
						return { kind: 'text', title, value: value.value.result };
					case 'redirect':
						return { kind: 'text', title, value: `The webpage has redirected to "${value.value.toURI.toString(true)}". Use the ${InternalFetchWebPageToolId} again to get its contents.` };
					case 'error':
						return { kind: 'text', title, value: `An error occurred retrieving the fetch result: ${value.value.error}` };
					default:
						assertNever(value.value);
				}
			} else {
				throw new Error('unreachable');
			}
		});
	}

	/**
	 * Maps the (case-insensitive) file extension of `uri.path` to a supported chat image
	 * mime type, or returns `undefined` when the extension is not a supported image type.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}