Book a Demo!
CoCalc Logo Icon
Store · Features · Docs · Share · Support · News · About · Policies · Sign Up · Sign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/builtInTools/fetchPageTool.ts
5263 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { assertNever } from '../../../../../base/common/assert.js';
7
import { CancellationToken } from '../../../../../base/common/cancellation.js';
8
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
9
import { Iterable } from '../../../../../base/common/iterator.js';
10
import { ResourceSet } from '../../../../../base/common/map.js';
11
import { extname } from '../../../../../base/common/path.js';
12
import { URI } from '../../../../../base/common/uri.js';
13
import { localize } from '../../../../../nls.js';
14
import { IFileService } from '../../../../../platform/files/common/files.js';
15
import { IWebContentExtractorService, WebContentExtractResult } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
16
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
17
import { ITrustedDomainService } from '../../../url/browser/trustedDomainService.js';
18
import { IChatService } from '../../common/chatService/chatService.js';
19
import { ChatImageMimeType } from '../../common/languageModels.js';
20
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/tools/languageModelToolsService.js';
21
import { InternalFetchWebPageToolId } from '../../common/tools/builtinTools/tools.js';
22
23
/**
 * Static registration data for the internal fetch-web-page tool.
 *
 * The tool takes a single required `urls` parameter (an array of strings)
 * and may request both pre- and post-approval.
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	modelDescription: 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.',
	source: ToolDataSource.Internal,
	canRequestPostApproval: true,
	canRequestPreApproval: true,
	// JSON schema describing the parameters the model must supply.
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: { type: 'string' },
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};
45
46
/**
 * Parameters accepted by the fetch-web-page tool invocation.
 */
export interface IFetchWebPageToolParams {
	/** The URLs (as raw strings) whose content should be fetched. */
	urls?: string[];
}
49
50
/**
 * Per-URL outcome of a fetch:
 * - `string`: plain text (file text, or a message used in place of content)
 * - `tooldata`: a data part (used for supported image files)
 * - `extracted`: the result reported by the web content extractor
 * - `undefined`: the URL was invalid or could not be read
 */
type ResultType =
	| string
	| { type: 'tooldata'; value: IToolResultDataPart }
	| { type: 'extracted'; value: WebContentExtractResult }
	| undefined;
51
52
/**
 * Implementation of the internal fetch-web-page tool.
 *
 * `http`/`https` URLs are handed to the web content extractor service; every
 * other scheme is read through the file service. Results are returned in the
 * same order as the URLs that were requested.
 */
export class FetchWebPageTool implements IToolImpl {

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
		@ITrustedDomainService private readonly _trustedDomainService: ITrustedDomainService,
		@IChatService private readonly _chatService: IChatService,
	) { }

	/**
	 * Fetches the content for every URL in the invocation parameters.
	 *
	 * @param invocation The tool invocation; `parameters` is read as {@link IFetchWebPageToolParams}.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to file reads.
	 * @returns One content part per requested URL (in request order), the URIs
	 * that were fetched as `toolResultDetails`, and `confirmResults`.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as IFetchWebPageToolParams).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);
		const allValidUris = [...webUris.values(), ...fileUris.values()];

		if (!allValidUris.length && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// Results are keyed by the original URL string rather than tracked by
		// position: `_parseUris` de-duplicates URLs, so a URL that is requested
		// more than once must resolve to the same single fetched result instead
		// of advancing a positional index past the end of the fetched contents.
		const resultsByUrl = new Map<string, ResultType>();

		// Get contents from web URIs
		let webContents: WebContentExtractResult[] = [];
		if (webUris.size > 0) {
			const trustedDomains = this._trustedDomainService.trustedDomains;
			webContents = await this._readerModeService.extract([...webUris.values()], { trustedDomains });

			// The extractor was given the URIs in map order, so pair results up in that same order.
			let index = 0;
			for (const url of webUris.keys()) {
				const extracted = webContents[index++];
				// A missing entry is left unset and later reported as an invalid URL.
				if (extracted) {
					resultsByUrl.set(url, { type: 'extracted', value: extracted });
				}
			}
		}

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					resultsByUrl.set(url, {
						type: 'tooldata',
						value: {
							kind: 'data',
							value: {
								mimeType: imageMimeType,
								data: fileContent.value
							}
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						resultsByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						resultsByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch {
				// If file service can't read it, treat as invalid: no entry is
				// recorded, which is reported as "Invalid URL" below.
			}
		}

		// Build results array in original order. URLs without an entry (invalid,
		// unreadable, or missing from the extractor output) map to `undefined`.
		const results: ResultType[] = urls.map((url): ResultType => resultsByUrl.get(url));

		// Skip confirming any results if every web content we got was an error or redirect.
		// NOTE: `every` is vacuously true when there are no web results, so
		// `confirmResults` is also set to `false` for requests without web URLs.
		let confirmResults: undefined | boolean;
		if (webContents.every(e => e.status === 'error' || e.status === 'redirect')) {
			confirmResults = false;
		}

		// Only include URIs that actually had content successfully fetched
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(urls, results),
			toolResultDetails: actuallyValidUris,
			confirmResults,
		};
	}

	/**
	 * Builds the progress / past-tense messages and the confirmation prompt
	 * shown before the tool runs.
	 *
	 * @param context Preparation context; `parameters.urls` is parsed and
	 * `chatSessionResource` (when set) is used to look up prior user messages.
	 * @param token Unused.
	 * @returns The prepared invocation, including confirmation messages.
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...invalidUris, ...additionalInvalidUrls];
		const urlsNeedingConfirmation = new ResourceSet([...webUris.values(), ...validFileUris]);

		let pastTenseMessage: MarkdownString;
		if (invalid.length > 1) {
			// If there are multiple invalid URLs, show them all
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.plural',
					'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', urlsNeedingConfirmation.size, invalid.map(url => `- ${url}`).join('\n')
				));
		} else if (invalid.length === 1) {
			// If there is only one invalid URL, show it
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.singular',
					'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
				));
		} else {
			// No invalid URLs
			pastTenseMessage = new MarkdownString();
		}

		const invocationMessage = new MarkdownString();
		if (urlsNeedingConfirmation.size > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', urlsNeedingConfirmation.size));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', urlsNeedingConfirmation.size));
		} else if (urlsNeedingConfirmation.size === 1) {
			const url = Iterable.first(urlsNeedingConfirmation)!.toString(true);
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		// URLs that the user already typed into a request of this chat session
		// are removed from the set that needs confirmation.
		let confirmationNotNeededReason: string | undefined;
		if (context.chatSessionResource) {
			const model = this._chatService.getSession(context.chatSessionResource);
			const userMessages = model?.getRequests().map(r => r.message.text.toLowerCase());
			let urlsMentionedInPrompt = false;
			for (const uri of urlsNeedingConfirmation) {
				// Normalize to lowercase and remove any trailing slash
				const urlToCheck = uri.toString(true).toLowerCase().replace(/\/$/, '');
				if (userMessages?.some(m => m.includes(urlToCheck))) {
					urlsNeedingConfirmation.delete(uri);
					urlsMentionedInPrompt = true;
				}
			}
			if (urlsMentionedInPrompt && urlsNeedingConfirmation.size === 0) {
				confirmationNotNeededReason = localize('fetchWebPage.urlMentionedInPrompt', 'Auto approved because URL was in prompt');
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		const allDomainsTrusted = Iterable.every(urlsNeedingConfirmation, u => this._trustedDomainService.isValid(u));
		let confirmationTitle: string | undefined;
		let confirmationMessage: string | MarkdownString | undefined;

		// Only set a title/message when at least one remaining URL is not
		// accepted by the trusted domain service.
		if (urlsNeedingConfirmation.size && !allDomainsTrusted) {
			if (urlsNeedingConfirmation.size === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					Iterable.first(urlsNeedingConfirmation)!.toString(true),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					[...urlsNeedingConfirmation].map(uri => `- ${uri.toString(true)}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
		}
		result.confirmationMessages = {
			title: confirmationTitle,
			message: confirmationMessage,
			confirmResults: urlsNeedingConfirmation.size > 0,
			allowAutoConfirm: true,
			disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true }),
			confirmationNotNeededReason
		};
		return result;
	}

	/**
	 * Splits raw URL strings into web URIs (`http`/`https`), URIs of any other
	 * scheme (handled via the file service), and strings that failed to parse.
	 * The maps are keyed by the original string, so duplicates collapse into
	 * one entry.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		for (const url of urls ?? []) {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch {
				invalidUris.add(url);
			}
		}

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts per-URL results into tool result parts. `results[i]` must be
	 * the outcome for `urls[i]`. A title naming the source URL is only set
	 * when there is more than one result.
	 */
	private _getPromptPartsForResults(urls: string[], results: ResultType[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map((value, i) => {
			const title = results.length > 1 ? localize('fetchWebPage.fetchedFrom', 'Fetched from {0}', urls[i]) : undefined;
			if (!value) {
				return {
					kind: 'text',
					title,
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					title,
					value: value
				};
			} else if (value.type === 'tooldata') {
				return { ...value.value, title };
			} else if (value.type === 'extracted') {
				switch (value.value.status) {
					case 'ok':
						return { kind: 'text', title, value: value.value.result };
					case 'redirect':
						return { kind: 'text', title, value: `The webpage has redirected to "${value.value.toURI.toString(true)}". Use the ${InternalFetchWebPageToolId} again to get its contents.` };
					case 'error':
						return { kind: 'text', title, value: `An error occurred retrieving the fetch result: ${value.value.error}` };
					default:
						assertNever(value.value);
				}
			} else {
				throw new Error('unreachable');
			}
		});
	}

	/**
	 * Maps the (case-insensitive) file extension of `uri.path` to a supported
	 * chat image MIME type, or `undefined` when the extension is not a
	 * supported image type.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}
344
345