CoCalc -- fetchPageTool.ts

GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/builtInTools/fetchPageTool.ts
5263 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
import { assertNever } from '../../../../../base/common/assert.js';
7
import { CancellationToken } from '../../../../../base/common/cancellation.js';
8
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
9
import { Iterable } from '../../../../../base/common/iterator.js';
10
import { ResourceSet } from '../../../../../base/common/map.js';
11
import { extname } from '../../../../../base/common/path.js';
12
import { URI } from '../../../../../base/common/uri.js';
13
import { localize } from '../../../../../nls.js';
14
import { IFileService } from '../../../../../platform/files/common/files.js';
15
import { IWebContentExtractorService, WebContentExtractResult } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
16
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
17
import { ITrustedDomainService } from '../../../url/browser/trustedDomainService.js';
18
import { IChatService } from '../../common/chatService/chatService.js';
19
import { ChatImageMimeType } from '../../common/languageModels.js';
20
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/tools/languageModelToolsService.js';
21
import { InternalFetchWebPageToolId } from '../../common/tools/builtinTools/tools.js';
22

23
/**
 * Static registration data for the built-in "fetch web page" tool.
 *
 * The input schema requires a single `urls` property: an array of strings.
 * The tool is an internal tool, cannot be referenced from a prompt, and may
 * request approval both before and after it runs.
 */
export const FetchWebPageToolData: IToolData = {
	// Identity and presentation
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.',
	source: ToolDataSource.Internal,

	// Approval capabilities
	canRequestPostApproval: true,
	canRequestPreApproval: true,

	// JSON schema describing the parameters accepted by the tool
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: { type: 'string' },
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};
45

46
/**
 * Parameters accepted by the fetch web page tool.
 */
export interface IFetchWebPageToolParams {
	/**
	 * The URLs to fetch. Optional at the type level; `invoke` treats a
	 * missing value as an empty list.
	 */
	urls?: string[];
}
49

50
/**
 * Per-URL outcome produced by the fetch tool before it is turned into a prompt part.
 */
type ResultType =
	// Plain text (text file content, or a message such as "binary not supported")
	| string
	// A ready-made data part (used for supported image files)
	| { type: 'tooldata'; value: IToolResultDataPart }
	// The raw result reported by the web content extractor
	| { type: 'extracted'; value: WebContentExtractResult }
	// Nothing could be produced for this URL (invalid URL or unreadable resource)
	| undefined;
51

52
/**
 * Built-in chat tool that fetches the content of one or more URLs.
 *
 * - `http`/`https` URLs are passed to the web content extractor service.
 * - URLs with any other scheme are read through the file service; supported
 *   image types are returned as data parts, text files as strings, and other
 *   binary files as a "not supported" message.
 * - URLs that cannot be parsed, or resources that cannot be read, are reported
 *   as `Invalid URL` in the result.
 */
export class FetchWebPageTool implements IToolImpl {

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
		@ITrustedDomainService private readonly _trustedDomainService: ITrustedDomainService,
		@IChatService private readonly _chatService: IChatService,
	) { }

	/**
	 * Fetches every URL in `invocation.parameters.urls` and returns one prompt part per
	 * input URL, in input order.
	 *
	 * @param invocation The tool invocation; its parameters are read as {@link IFetchWebPageToolParams}.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to the file service reads.
	 * @returns The prompt parts, the list of URIs shown as result details and, when every
	 * web result was an error or redirect, `confirmResults: false`.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as IFetchWebPageToolParams).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);
		const allValidUris = [...webUris.values(), ...fileUris.values()];

		if (!allValidUris.length && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// Results are keyed by the ORIGINAL url string. `webUris` and `fileUris` are Maps and
		// therefore hold each distinct url only once, while `urls` may contain the same url
		// several times. Looking results up by key (instead of advancing a positional index per
		// input url) keeps every occurrence aligned with its own result.
		const resultsByUrl = new Map<string, ResultType>();

		// Get contents from web URIs
		let webContents: WebContentExtractResult[] = [];
		if (webUris.size > 0) {
			const trustedDomains = this._trustedDomainService.trustedDomains;
			webContents = await this._readerModeService.extract([...webUris.values()], { trustedDomains });
		}
		// NOTE(review): assumes extract() returns one result per requested URI, in request order.
		let webIndex = 0;
		for (const url of webUris.keys()) {
			const extracted = webContents[webIndex++];
			// A missing entry is reported like an invalid URL instead of crashing later on.
			resultsByUrl.set(url, extracted ? { type: 'extracted', value: extracted } : undefined);
		}

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris.entries()) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					resultsByUrl.set(url, {
						type: 'tooldata',
						value: {
							kind: 'data',
							value: {
								mimeType: imageMimeType,
								data: fileContent.value
							}
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						resultsByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						resultsByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch (error) {
				// If file service can't read it, treat as invalid
				resultsByUrl.set(url, undefined);
			}
		}

		// Build results array in original order. Urls that could not be parsed have no entry in
		// the map and therefore yield `undefined`, which is rendered as "Invalid URL".
		const results: ResultType[] = urls.map(url => resultsByUrl.get(url));

		// Skip confirming any results if every web content we got was an error or redirect.
		// `every` is vacuously true for an empty array, so this also applies when no web URL
		// was requested at all.
		let confirmResults: undefined | boolean;
		if (webContents.every(e => e.status === 'error' || e.status === 'redirect')) {
			confirmResults = false;
		}

		// Result details: every web URI that was requested (regardless of its extraction status)
		// plus the file URIs that could actually be read.
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(urls, results),
			toolResultDetails: actuallyValidUris,
			confirmResults,
		};
	}

	/**
	 * Builds the progress / past-tense messages and the confirmation prompt for an invocation.
	 *
	 * File URIs that the file service cannot `stat` are reported as invalid. URIs whose text
	 * appears (case-insensitively, ignoring one trailing slash) in a user message of the chat
	 * session do not need confirmation. A confirmation title and message are only produced when
	 * at least one remaining URI is not accepted by the trusted domain service.
	 *
	 * @param context Preparation context; `context.parameters.urls` is parsed as the URL list.
	 * @param token Cancellation token (not used by this method).
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch (error) {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...Array.from(invalidUris), ...additionalInvalidUrls];
		const urlsNeedingConfirmation = new ResourceSet([...webUris.values(), ...validFileUris]);

		const pastTenseMessage = invalid.length
			? invalid.length > 1
				// If there are multiple invalid URLs, show them all
				? new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.plural',
						'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', urlsNeedingConfirmation.size, invalid.map(url => `- ${url}`).join('\n')
					))
				// If there is only one invalid URL, show it
				: new MarkdownString(
					localize(
						'fetchWebPage.pastTenseMessage.singular',
						'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
					))
			// No invalid URLs
			: new MarkdownString();

		const invocationMessage = new MarkdownString();
		if (urlsNeedingConfirmation.size > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', urlsNeedingConfirmation.size));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', urlsNeedingConfirmation.size));
		} else if (urlsNeedingConfirmation.size === 1) {
			const url = Iterable.first(urlsNeedingConfirmation)!.toString(true);
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		// URIs that the user typed into the chat themselves do not need to be confirmed.
		let confirmationNotNeededReason: string | undefined;
		if (context.chatSessionResource) {
			const model = this._chatService.getSession(context.chatSessionResource);
			const userMessages = model?.getRequests().map(r => r.message.text.toLowerCase());
			let urlsMentionedInPrompt = false;
			for (const uri of urlsNeedingConfirmation) {
				// Normalize to lowercase and remove any trailing slash
				const toCheck = uri.toString(true).toLowerCase().replace(/\/$/, '');
				if (userMessages?.some(m => m.includes(toCheck))) {
					urlsNeedingConfirmation.delete(uri);
					urlsMentionedInPrompt = true;
				}
			}
			// Only report auto-approval when nothing is left to confirm.
			if (urlsMentionedInPrompt && urlsNeedingConfirmation.size === 0) {
				confirmationNotNeededReason = localize('fetchWebPage.urlMentionedInPrompt', 'Auto approved because URL was in prompt');
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		const allDomainsTrusted = Iterable.every(urlsNeedingConfirmation, u => this._trustedDomainService.isValid(u));
		let confirmationTitle: string | undefined;
		let confirmationMessage: string | MarkdownString | undefined;

		if (urlsNeedingConfirmation.size && !allDomainsTrusted) {
			if (urlsNeedingConfirmation.size === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					Iterable.first(urlsNeedingConfirmation)!.toString(true),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					[...urlsNeedingConfirmation].map(uri => `- ${uri.toString(true)}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
		}
		result.confirmationMessages = {
			title: confirmationTitle,
			message: confirmationMessage,
			confirmResults: urlsNeedingConfirmation.size > 0,
			allowAutoConfirm: true,
			disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true }),
			confirmationNotNeededReason
		};
		return result;
	}

	/**
	 * Splits the given URL strings into web URIs (`http`/`https`), URIs to be handled by the
	 * file service (any other scheme) and strings for which `URI.parse` threw.
	 *
	 * Both maps are keyed by the original URL string, so a URL that is passed several times
	 * appears only once in the result.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		urls?.forEach(url => {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch (e) {
				invalidUris.add(url);
			}
		});

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts the per-URL results into prompt parts.
	 *
	 * @param urls The input URLs; `urls[i]` is used for the title of `results[i]`.
	 * @param results One entry per input URL, in the same order as `urls`.
	 * @returns One part per result. A title is only set when there is more than one result.
	 */
	private _getPromptPartsForResults(urls: string[], results: ResultType[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map((value, i) => {
			const title = results.length > 1 ? localize('fetchWebPage.fetchedFrom', 'Fetched from {0}', urls[i]) : undefined;
			if (!value) {
				// `undefined` (and an empty string) is reported as an invalid URL
				return {
					kind: 'text',
					title,
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					title,
					value: value
				};
			} else if (value.type === 'tooldata') {
				return { ...value.value, title };
			} else if (value.type === 'extracted') {
				switch (value.value.status) {
					case 'ok':
						return { kind: 'text', title, value: value.value.result };
					case 'redirect':
						return { kind: 'text', title, value: `The webpage has redirected to "${value.value.toURI.toString(true)}". Use the ${InternalFetchWebPageToolId} again to get its contents.` };
					case 'error':
						return { kind: 'text', title, value: `An error occurred retrieving the fetch result: ${value.value.error}` };
					default:
						assertNever(value.value);
				}
			} else {
				throw new Error('unreachable');
			}
		});
	}

	/**
	 * Maps the (case-insensitive) file extension of `uri.path` to a supported chat image mime
	 * type, or returns `undefined` when the extension is not one of png, jpg, jpeg, gif, webp, bmp.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}
344

345
Product

Resources

Company