Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/builtInTools/fetchPageTool.ts
5263 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { assertNever } from '../../../../../base/common/assert.js';
import { CancellationToken } from '../../../../../base/common/cancellation.js';
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
import { Iterable } from '../../../../../base/common/iterator.js';
import { ResourceSet } from '../../../../../base/common/map.js';
import { extname } from '../../../../../base/common/path.js';
import { URI } from '../../../../../base/common/uri.js';
import { localize } from '../../../../../nls.js';
import { IFileService } from '../../../../../platform/files/common/files.js';
import { IWebContentExtractorService, WebContentExtractResult } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
import { ITrustedDomainService } from '../../../url/browser/trustedDomainService.js';
import { IChatService } from '../../common/chatService/chatService.js';
import { ChatImageMimeType } from '../../common/languageModels.js';
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/tools/languageModelToolsService.js';
import { InternalFetchWebPageToolId } from '../../common/tools/builtinTools/tools.js';

/**
 * Static description of the internal "fetch web page" tool: its id, display
 * name, the description handed to the model, and the JSON schema of its input
 * (a required `urls` string array).
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.',
	source: ToolDataSource.Internal,
	canRequestPostApproval: true,
	canRequestPreApproval: true,
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: {
					type: 'string',
				},
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};

/** Parameters accepted by {@link FetchWebPageTool}. */
export interface IFetchWebPageToolParams {
	urls?: string[];
}

/**
 * Per-URL outcome collected by `invoke`:
 * - `string`: text content of a file (or the "binary not supported" notice),
 * - `tooldata`: an image file passed through as a data part,
 * - `extracted`: the web content extractor's result for an http(s) URL,
 * - `undefined`: the URL was invalid or could not be read.
 */
type ResultType = string | { type: 'tooldata'; value: IToolResultDataPart } | { type: 'extracted'; value: WebContentExtractResult } | undefined;

/**
 * Tool implementation that fetches content for a list of URLs. `http`/`https`
 * URLs go through the web content extractor; every other scheme is read
 * through the file service.
 */
export class FetchWebPageTool implements IToolImpl {

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
		@ITrustedDomainService private readonly _trustedDomainService: ITrustedDomainService,
		@IChatService private readonly _chatService: IChatService,
	) { }

	/**
	 * Fetches every URL in `invocation.parameters.urls` and returns one content
	 * part per requested URL, in the order the URLs were given.
	 *
	 * @param invocation Tool invocation; its parameters are read as {@link IFetchWebPageToolParams}.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to file reads.
	 * @returns The content parts, the URIs handed to the extractor or successfully
	 * read from the file service (`toolResultDetails`), and `confirmResults: false`
	 * when no web result is worth confirming.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as IFetchWebPageToolParams).urls ?? [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);

		if (webUris.size === 0 && fileUris.size === 0 && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// Results are keyed by the original URL string rather than by position:
		// `webUris`/`fileUris` hold one entry per distinct URL, so a URL that is
		// listed more than once in `urls` must resolve to the same fetched result
		// instead of advancing a positional cursor past the end of the results.
		const resultsByUrl = new Map<string, ResultType>();

		// Get contents from web URIs
		let webContents: WebContentExtractResult[] = [];
		if (webUris.size > 0) {
			const trustedDomains = this._trustedDomainService.trustedDomains;
			webContents = await this._readerModeService.extract([...webUris.values()], { trustedDomains });

			// Map keys and values iterate in the same (insertion) order, so the
			// n-th key corresponds to the n-th URI passed to `extract` above.
			let webIndex = 0;
			for (const url of webUris.keys()) {
				const extracted = webContents[webIndex++];
				resultsByUrl.set(url, extracted ? { type: 'extracted', value: extracted } : undefined);
			}
		}

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					resultsByUrl.set(url, {
						type: 'tooldata',
						value: {
							kind: 'data',
							value: {
								mimeType: imageMimeType,
								data: fileContent.value
							}
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						resultsByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						resultsByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch {
				// If file service can't read it, treat as invalid
				resultsByUrl.set(url, undefined);
			}
		}

		// Build results array in original order; URLs that failed to parse have
		// no entry in the lookup and therefore yield `undefined`.
		const results: ResultType[] = urls.map(url => resultsByUrl.get(url));

		// Skip confirming any results if every web content we got was an error or redirect
		// (this is also the case when there was no web content at all).
		let confirmResults: undefined | boolean;
		if (webContents.every(e => e.status === 'error' || e.status === 'redirect')) {
			confirmResults = false;
		}

		// Only include URIs that actually had content successfully fetched
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(urls, results),
			toolResultDetails: actuallyValidUris,
			confirmResults,
		};
	}

	/**
	 * Builds the progress/past-tense messages and the confirmation prompt shown
	 * before the tool runs.
	 *
	 * File URIs that the file service cannot `stat` are reported as invalid.
	 * URLs that already appear in one of the session's request messages are
	 * dropped from the set needing confirmation.
	 *
	 * @param context Preparation context; `context.parameters.urls` is the requested URL list.
	 * @param token Cancellation token (currently unused).
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...invalidUris, ...additionalInvalidUrls];
		const urlsNeedingConfirmation = new ResourceSet([...webUris.values(), ...validFileUris]);

		const pastTenseMessage = invalid.length
			? invalid.length > 1
				// If there are multiple invalid URLs, show them all
				? new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.plural',
						'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', urlsNeedingConfirmation.size, invalid.map(url => `- ${url}`).join('\n')
					))
				// If there is only one invalid URL, show it
				: new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.singular',
						'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
					))
			// No invalid URLs
			: new MarkdownString();

		// NOTE: the messages below are built from the full set, before URLs
		// mentioned in the prompt are removed from it further down.
		const invocationMessage = new MarkdownString();
		if (urlsNeedingConfirmation.size > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', urlsNeedingConfirmation.size));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', urlsNeedingConfirmation.size));
		} else if (urlsNeedingConfirmation.size === 1) {
			const url = Iterable.first(urlsNeedingConfirmation)!.toString(true);
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		let confirmationNotNeededReason: string | undefined;
		if (context.chatSessionResource) {
			const model = this._chatService.getSession(context.chatSessionResource);
			const userMessages = model?.getRequests().map(r => r.message.text.toLowerCase());
			let urlsMentionedInPrompt = false;
			for (const uri of urlsNeedingConfirmation) {
				// Normalize to lowercase and remove any trailing slash
				const urlToCheck = uri.toString(true).toLowerCase().replace(/\/$/, '');
				if (userMessages?.some(m => m.includes(urlToCheck))) {
					urlsNeedingConfirmation.delete(uri);
					urlsMentionedInPrompt = true;
				}
			}
			if (urlsMentionedInPrompt && urlsNeedingConfirmation.size === 0) {
				confirmationNotNeededReason = localize('fetchWebPage.urlMentionedInPrompt', 'Auto approved because URL was in prompt');
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		const allDomainsTrusted = Iterable.every(urlsNeedingConfirmation, u => this._trustedDomainService.isValid(u));
		let confirmationTitle: string | undefined;
		let confirmationMessage: string | MarkdownString | undefined;

		// Title and message are only filled in when something remains to confirm
		// and at least one of the remaining URLs is not on a trusted domain.
		if (urlsNeedingConfirmation.size && !allDomainsTrusted) {
			if (urlsNeedingConfirmation.size === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					Iterable.first(urlsNeedingConfirmation)!.toString(true),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					[...urlsNeedingConfirmation].map(uri => `- ${uri.toString(true)}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
		}
		result.confirmationMessages = {
			title: confirmationTitle,
			message: confirmationMessage,
			confirmResults: urlsNeedingConfirmation.size > 0,
			allowAutoConfirm: true,
			disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true }),
			confirmationNotNeededReason
		};
		return result;
	}

	/**
	 * Classifies the given URL strings by scheme.
	 *
	 * @param urls Raw URL strings; `undefined` yields three empty collections.
	 * @returns `webUris` (http/https) and `fileUris` (any other scheme), both
	 * keyed by the original string, plus `invalidUris` holding the strings for
	 * which `URI.parse` threw.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		urls?.forEach(url => {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch {
				invalidUris.add(url);
			}
		});

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts the per-URL results into tool result parts, one per entry.
	 *
	 * @param urls The requested URLs; `urls[i]` is used in the title of `results[i]`.
	 * @param results Outcomes in the same order as `urls`.
	 * @returns Text or data parts. A title is only attached when there is more
	 * than one result.
	 */
	private _getPromptPartsForResults(urls: string[], results: ResultType[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map((value, i) => {
			const title = results.length > 1 ? localize('fetchWebPage.fetchedFrom', 'Fetched from {0}', urls[i]) : undefined;
			if (!value) {
				return {
					kind: 'text',
					title,
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					title,
					value: value
				};
			} else if (value.type === 'tooldata') {
				return { ...value.value, title };
			} else if (value.type === 'extracted') {
				switch (value.value.status) {
					case 'ok':
						return { kind: 'text', title, value: value.value.result };
					case 'redirect':
						return { kind: 'text', title, value: `The webpage has redirected to "${value.value.toURI.toString(true)}". Use the ${InternalFetchWebPageToolId} again to get its contents.` };
					case 'error':
						return { kind: 'text', title, value: `An error occurred retrieving the fetch result: ${value.value.error}` };
					default:
						assertNever(value.value);
				}
			} else {
				throw new Error('unreachable');
			}
		});
	}

	/**
	 * Maps a URI's file extension (case-insensitive) to the chat image MIME
	 * type it represents.
	 *
	 * @returns The MIME type for `.png`, `.jpg`/`.jpeg`, `.gif`, `.webp` and
	 * `.bmp`; `undefined` for any other extension.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}