Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/builtInTools/fetchPageTool.ts
4780 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { assertNever } from '../../../../../base/common/assert.js';
7
import { CancellationToken } from '../../../../../base/common/cancellation.js';
8
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
9
import { Iterable } from '../../../../../base/common/iterator.js';
10
import { ResourceSet } from '../../../../../base/common/map.js';
11
import { extname } from '../../../../../base/common/path.js';
12
import { URI } from '../../../../../base/common/uri.js';
13
import { localize } from '../../../../../nls.js';
14
import { IFileService } from '../../../../../platform/files/common/files.js';
15
import { IWebContentExtractorService, WebContentExtractResult } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
16
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
17
import { ITrustedDomainService } from '../../../url/browser/trustedDomainService.js';
18
import { IChatService } from '../../common/chatService/chatService.js';
19
import { LocalChatSessionUri } from '../../common/model/chatUri.js';
20
import { ChatImageMimeType } from '../../common/languageModels.js';
21
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/tools/languageModelToolsService.js';
22
import { InternalFetchWebPageToolId } from '../../common/tools/builtinTools/tools.js';
23
24
/**
 * Static registration data for the internal "fetch web page" tool: its id,
 * the description handed to the language model, the approval capabilities it
 * opts into, and the JSON schema of its input (a required `urls` string array).
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.',
	source: ToolDataSource.Internal,
	// The tool may ask for approval both before running and before its results are used.
	canRequestPostApproval: true,
	canRequestPreApproval: true,
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: { type: 'string' },
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};
46
47
/**
 * Parameters accepted by the fetch web page tool.
 */
export interface IFetchWebPageToolParams {
	/** The URLs to fetch. Optional at the type level; `invoke` falls back to an empty list. */
	urls?: string[];
}
50
51
/**
 * Per-URL outcome produced by `FetchWebPageTool.invoke` before it is turned
 * into prompt parts.
 */
type ResultType =
	// Plain text: the decoded content of a text file, or a message that replaces it.
	| string
	// A ready-made data part (used for supported image files).
	| { type: 'tooldata'; value: IToolResultDataPart }
	// The raw outcome reported by the web content extractor for a web URL.
	| { type: 'extracted'; value: WebContentExtractResult }
	// Nothing could be produced for this URL.
	| undefined;
52
53
/**
 * Implementation of the internal "fetch web page" tool.
 *
 * `http`/`https` URLs are handed to the web content extractor service; URLs
 * with any other scheme are read through the file service. Results are
 * reported back to the caller in the same order as the requested URLs.
 */
export class FetchWebPageTool implements IToolImpl {

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
		@ITrustedDomainService private readonly _trustedDomainService: ITrustedDomainService,
		@IChatService private readonly _chatService: IChatService,
	) { }

	/**
	 * Fetches every URL in `invocation.parameters.urls` and returns one content
	 * part per requested URL, in request order.
	 *
	 * @param invocation The tool invocation; its parameters are read as {@link IFetchWebPageToolParams}.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token, forwarded to the file reads.
	 * @returns The prompt parts, the URIs reported as result details, and
	 * `confirmResults: false` when no web result is worth confirming.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as IFetchWebPageToolParams).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);

		if (webUris.size === 0 && fileUris.size === 0 && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// Get contents from web URIs, remembered by the original URL string.
		// `_parseUris` de-duplicates URLs, so results must be looked up by key (not
		// by a running index) or a URL that is requested twice would shift every
		// later result and eventually read past the end of the results array.
		let webContents: WebContentExtractResult[] = [];
		const webResultsByUrl = new Map<string, WebContentExtractResult>();
		if (webUris.size > 0) {
			const trustedDomains = this._trustedDomainService.trustedDomains;
			webContents = await this._readerModeService.extract([...webUris.values()], { trustedDomains });

			// Assumes extract() returns one result per requested URI, in request order.
			let position = 0;
			for (const url of webUris.keys()) {
				const extracted = webContents[position++];
				if (extracted) {
					webResultsByUrl.set(url, extracted);
				}
			}
		}

		// Get contents from file URIs, also remembered by the original URL string.
		// URLs that could not be read are simply absent from the map.
		const fileResultsByUrl = new Map<string, string | { type: 'tooldata'; value: IToolResultDataPart }>();
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					fileResultsByUrl.set(url, {
						type: 'tooldata',
						value: {
							kind: 'data',
							value: {
								mimeType: imageMimeType,
								data: fileContent.value
							}
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						fileResultsByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						fileResultsByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch {
				// If file service can't read it, treat as invalid (no entry in the map)
			}
		}

		// Build results array in original order. Invalid URLs and unreadable files
		// have no entry in either map and therefore resolve to `undefined`.
		const results = urls.map((url): ResultType => {
			const extracted = webResultsByUrl.get(url);
			if (extracted) {
				return { type: 'extracted', value: extracted };
			}
			return fileResultsByUrl.get(url);
		});

		// Skip confirming any results if every web content we got was an error or redirect
		// NOTE(review): `every` is also true for an empty array, so this is `false`
		// as well when only file URIs were requested - confirm that is intended.
		let confirmResults: undefined | boolean;
		if (webContents.every(e => e.status === 'error' || e.status === 'redirect')) {
			confirmResults = false;
		}

		// All web URIs plus the file URIs that were read successfully
		// NOTE(review): web URIs are included even when their extraction failed.
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(urls, results),
			toolResultDetails: actuallyValidUris,
			confirmResults,
		};
	}

	/**
	 * Builds the progress/past-tense messages and the confirmation prompt shown
	 * before the tool runs.
	 *
	 * URLs that appear (case-insensitively, ignoring one trailing slash) in a
	 * user message of the chat session are removed from the set that needs
	 * confirmation. A confirmation title/message is only produced when URLs
	 * remain and at least one of them is not accepted by the trusted domain
	 * service.
	 *
	 * @param context Preparation context; `context.parameters.urls` is parsed and
	 * `context.chatSessionId` (when set) is used to look up the user's messages.
	 * @param token Unused.
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...invalidUris, ...additionalInvalidUrls];
		const urlsNeedingConfirmation = new ResourceSet([...webUris.values(), ...validFileUris]);

		let pastTenseMessage: MarkdownString;
		if (invalid.length > 1) {
			// If there are multiple invalid URLs, show them all
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.plural',
					'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', urlsNeedingConfirmation.size, invalid.map(url => `- ${url}`).join('\n')
				));
		} else if (invalid.length === 1) {
			// If there is only one invalid URL, show it
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.singular',
					'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
				));
		} else {
			// No invalid URLs
			pastTenseMessage = new MarkdownString();
		}

		const invocationMessage = new MarkdownString();
		if (urlsNeedingConfirmation.size > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', urlsNeedingConfirmation.size));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', urlsNeedingConfirmation.size));
		} else if (urlsNeedingConfirmation.size === 1) {
			const url = Iterable.first(urlsNeedingConfirmation)!.toString(true);
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		let confirmationNotNeededReason: string | undefined;
		if (context.chatSessionId) {
			const model = this._chatService.getSession(LocalChatSessionUri.forSession(context.chatSessionId));
			const userMessages = model?.getRequests().map(r => r.message.text.toLowerCase());
			let urlsMentionedInPrompt = false;
			for (const uri of urlsNeedingConfirmation) {
				// Normalize to lowercase and remove any trailing slash
				const normalizedUrl = uri.toString(true).toLowerCase().replace(/\/$/, '');
				if (userMessages?.some(m => m.includes(normalizedUrl))) {
					urlsNeedingConfirmation.delete(uri);
					urlsMentionedInPrompt = true;
				}
			}
			// Only report auto-approval when every remaining URL came from the prompt
			if (urlsMentionedInPrompt && urlsNeedingConfirmation.size === 0) {
				confirmationNotNeededReason = localize('fetchWebPage.urlMentionedInPrompt', 'Auto approved because URL was in prompt');
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		const allDomainsTrusted = Iterable.every(urlsNeedingConfirmation, u => this._trustedDomainService.isValid(u));
		let confirmationTitle: string | undefined;
		let confirmationMessage: string | MarkdownString | undefined;

		if (urlsNeedingConfirmation.size && !allDomainsTrusted) {
			if (urlsNeedingConfirmation.size === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					Iterable.first(urlsNeedingConfirmation)!.toString(true),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					[...urlsNeedingConfirmation].map(uri => `- ${uri.toString(true)}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
		}
		result.confirmationMessages = {
			title: confirmationTitle,
			message: confirmationMessage,
			confirmResults: urlsNeedingConfirmation.size > 0,
			allowAutoConfirm: true,
			disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true }),
			confirmationNotNeededReason
		};
		return result;
	}

	/**
	 * Sorts the given URL strings into three buckets, each keyed by the original
	 * string (so a URL that is listed more than once appears only once):
	 * `http`/`https` URIs, URIs of any other scheme (to be tried through the
	 * file service), and strings for which `URI.parse` threw.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		for (const url of urls ?? []) {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch {
				invalidUris.add(url);
			}
		}

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts per-URL results into tool result parts. `results[i]` belongs to
	 * `urls[i]`; a "Fetched from ..." title is only attached when there is more
	 * than one result.
	 */
	private _getPromptPartsForResults(urls: string[], results: ResultType[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map((value, i) => {
			const title = results.length > 1 ? localize('fetchWebPage.fetchedFrom', 'Fetched from {0}', urls[i]) : undefined;
			// NOTE(review): an empty string (e.g. the content of an empty text file)
			// is falsy and is therefore reported as "Invalid URL" as well.
			if (!value) {
				return {
					kind: 'text',
					title,
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					title,
					value: value
				};
			} else if (value.type === 'tooldata') {
				return { ...value.value, title };
			} else if (value.type === 'extracted') {
				switch (value.value.status) {
					case 'ok':
						return { kind: 'text', title, value: value.value.result };
					case 'redirect':
						return { kind: 'text', title, value: `The webpage has redirected to "${value.value.toURI.toString(true)}". Use the ${InternalFetchWebPageToolId} again to get its contents.` };
					case 'error':
						return { kind: 'text', title, value: `An error occurred retrieving the fetch result: ${value.value.error}` };
					default:
						assertNever(value.value);
				}
			} else {
				throw new Error('unreachable');
			}
		});
	}

	/**
	 * Maps the (case-insensitive) file extension of `uri.path` to a chat image
	 * MIME type, or returns `undefined` when the extension is not one of
	 * `.png`, `.jpg`, `.jpeg`, `.gif`, `.webp` or `.bmp`.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}
345
346