Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/tools/fetchPageTool.ts
3244 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { CancellationToken } from '../../../../../base/common/cancellation.js';
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
import { ResourceSet } from '../../../../../base/common/map.js';
import { extname } from '../../../../../base/common/path.js';
import { URI } from '../../../../../base/common/uri.js';
import { localize } from '../../../../../nls.js';
import { IFileService } from '../../../../../platform/files/common/files.js';
import { IWebContentExtractorService } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
import { ChatImageMimeType } from '../../common/languageModels.js';
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/languageModelToolsService.js';
import { InternalFetchWebPageToolId } from '../../common/tools/tools.js';

/**
 * Static tool description for the internal "fetch web page" tool: its id,
 * display name, model-facing description and the JSON schema of its input
 * (a required `urls` array of strings).
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: localize('fetchWebPage.modelDescription', 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.'),
	source: ToolDataSource.Internal,
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: {
					type: 'string',
				},
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};

/**
 * Tool implementation that fetches content for a list of URL strings.
 *
 * `http`/`https` URLs are passed to the web content extractor service; URLs
 * with any other scheme are read through the file service. Strings that
 * `URI.parse` rejects are reported back as invalid.
 */
export class FetchWebPageTool implements IToolImpl {
	/**
	 * Web URIs that have already gone through `invoke` on this instance.
	 * Despite the name, the set stores the URIs themselves (see `invoke`);
	 * `prepareToolInvocation` skips the confirmation for URIs found here.
	 */
	private _alreadyApprovedDomains = new ResourceSet();

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
	) { }

	/**
	 * Fetches every URL in `invocation.parameters.urls` and returns one
	 * result part per input entry, in input order.
	 *
	 * Content is keyed by the original URL string, so a URL that appears more
	 * than once in the input is fetched once and every occurrence receives
	 * that same content.
	 *
	 * @param invocation The tool invocation; `parameters.urls` is the list of URL strings.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to the file service reads.
	 * @returns The per-URL content parts plus, in `toolResultDetails`, the web URIs
	 * and the file URIs that could actually be read.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as { urls?: string[] }).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);

		// Nothing parsed at all (neither valid nor invalid), i.e. no URLs were supplied.
		if (webUris.size === 0 && fileUris.size === 0 && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// We approved these via confirmation, so mark them as "approved" in this session
		// if they are not approved via the trusted domain service.
		for (const uri of webUris.values()) {
			this._alreadyApprovedDomains.add(uri);
		}

		// Fetched content keyed by the original URL string. The URI maps hold each
		// distinct URL once, so positional counters over `urls` would drift (and
		// hand out the wrong content) as soon as the input contains a duplicate.
		const contentByUrl = new Map<string, string | IToolResultDataPart | undefined>();

		// Get contents from web URIs. The extractor result is aligned with the
		// iteration order of `webUris`, which is also the order of its keys.
		const webUrls = [...webUris.keys()];
		const webContents = webUris.size > 0 ? await this._readerModeService.extract([...webUris.values()]) : [];
		webUrls.forEach((url, index) => contentByUrl.set(url, webContents[index]));

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					contentByUrl.set(url, {
						kind: 'data',
						value: {
							mimeType: imageMimeType,
							data: fileContent.value
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						contentByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						contentByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch (error) {
				// If file service can't read it, treat as invalid
				contentByUrl.set(url, undefined);
			}
		}

		// Build results array in original order. Invalid URLs were never added to
		// the map, so they resolve to undefined here.
		const results = urls.map(url => contentByUrl.get(url));

		// Only include URIs that actually had content successfully fetched
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(results),
			toolResultDetails: actuallyValidUris
		};
	}

	/**
	 * Builds the progress/past-tense messages for an invocation and, when at
	 * least one valid URI is not yet in `_alreadyApprovedDomains`, the
	 * confirmation prompt listing those URIs.
	 *
	 * File URIs that the file service cannot `stat` are reported as invalid
	 * alongside the strings that failed to parse.
	 *
	 * @param context Preparation context; `parameters.urls` is the list of URL strings.
	 * @param token Unused.
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch (error) {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...Array.from(invalidUris), ...additionalInvalidUrls];
		const valid = [...webUris.values(), ...validFileUris];
		const urlsNeedingConfirmation = valid.length > 0 ? valid.filter(url => !this._alreadyApprovedDomains.has(url)) : [];

		const pastTenseMessage = invalid.length
			? invalid.length > 1
				// If there are multiple invalid URLs, show them all
				? new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.plural',
						'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', valid.length, invalid.map(url => `- ${url}`).join('\n')
					))
				// If there is only one invalid URL, show it
				: new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.singular',
						'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
					))
			// No invalid URLs
			: new MarkdownString();

		const invocationMessage = new MarkdownString();
		if (valid.length > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', valid.length));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', valid.length));
		} else if (valid.length === 1) {
			const url = valid[0].toString();
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		if (urlsNeedingConfirmation.length) {
			let confirmationTitle: string;
			let confirmationMessage: string | MarkdownString;
			if (urlsNeedingConfirmation.length === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					urlsNeedingConfirmation[0].toString(),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					urlsNeedingConfirmation.map(uri => `- ${uri.toString()}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
			result.confirmationMessages = {
				title: confirmationTitle,
				message: confirmationMessage,
				allowAutoConfirm: true,
				disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true })
			};
		}
		return result;
	}

	/**
	 * Splits raw URL strings into three groups, each keyed by the original
	 * string: `http`/`https` URIs, URIs with any other scheme (to be tried via
	 * the file service), and strings for which `URI.parse` threw.
	 *
	 * Because the groups are keyed by the string, a URL repeated in `urls`
	 * yields a single entry.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		urls?.forEach(url => {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch (e) {
				invalidUris.add(url);
			}
		});

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts per-URL results into tool result parts: strings become text
	 * parts, data parts pass through unchanged, and any falsy entry becomes
	 * an 'Invalid URL' text part.
	 *
	 * NOTE(review): the falsy check means an empty string is also reported as
	 * 'Invalid URL' — confirm this is intended for empty content.
	 */
	private _getPromptPartsForResults(results: (string | IToolResultDataPart | undefined)[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map(value => {
			if (!value) {
				return {
					kind: 'text',
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					value: value
				};
			} else {
				// This is an IToolResultDataPart
				return value;
			}
		});
	}

	/**
	 * Maps the lower-cased file extension of `uri.path` to a supported chat
	 * image MIME type, or returns `undefined` for any other extension.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}