Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/workbench/contrib/chat/electron-browser/tools/fetchPageTool.ts
3244 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
import { CancellationToken } from '../../../../../base/common/cancellation.js';
7
import { MarkdownString } from '../../../../../base/common/htmlContent.js';
8
import { ResourceSet } from '../../../../../base/common/map.js';
9
import { extname } from '../../../../../base/common/path.js';
10
import { URI } from '../../../../../base/common/uri.js';
11
import { localize } from '../../../../../nls.js';
12
import { IFileService } from '../../../../../platform/files/common/files.js';
13
import { IWebContentExtractorService } from '../../../../../platform/webContentExtractor/common/webContentExtractor.js';
14
import { detectEncodingFromBuffer } from '../../../../services/textfile/common/encoding.js';
15
import { ChatImageMimeType } from '../../common/languageModels.js';
16
import { CountTokensCallback, IPreparedToolInvocation, IToolData, IToolImpl, IToolInvocation, IToolInvocationPreparationContext, IToolResult, IToolResultDataPart, IToolResultTextPart, ToolDataSource, ToolProgress } from '../../common/languageModelToolsService.js';
17
import { InternalFetchWebPageToolId } from '../../common/tools/tools.js';
18
19
/**
 * Static descriptor for the internal "fetch web page" tool.
 *
 * The tool is registered under {@link InternalFetchWebPageToolId}, is marked as
 * an internal tool source, and cannot be referenced from a prompt. Its input is
 * an object with a single required `urls` property holding an array of strings.
 */
export const FetchWebPageToolData: IToolData = {
	id: InternalFetchWebPageToolId,
	displayName: 'Fetch Web Page',
	canBeReferencedInPrompt: false,
	// Description surfaced to the language model so it knows when to call the tool.
	modelDescription: localize('fetchWebPage.modelDescription', 'Fetches the main content from a web page. This tool is useful for summarizing or analyzing the content of a webpage.'),
	source: ToolDataSource.Internal,
	// JSON schema describing the tool's parameters.
	inputSchema: {
		type: 'object',
		properties: {
			urls: {
				type: 'array',
				items: {
					type: 'string',
				},
				description: localize('fetchWebPage.urlsDescription', 'An array of URLs to fetch content from.')
			}
		},
		required: ['urls']
	}
};
39
40
/**
 * Implementation of the internal "fetch web page" tool.
 *
 * `http`/`https` URLs are handed to the web content extractor service; every
 * other scheme is read through the file service. Results are returned in the
 * same order as the requested URLs, one part per requested URL.
 */
export class FetchWebPageTool implements IToolImpl {
	// URIs that were fetched by a previous `invoke` call. `prepareToolInvocation`
	// does not ask for confirmation again for URIs contained in this set.
	private _alreadyApprovedDomains = new ResourceSet();

	constructor(
		@IWebContentExtractorService private readonly _readerModeService: IWebContentExtractorService,
		@IFileService private readonly _fileService: IFileService,
	) { }

	/**
	 * Fetches the content of every URL in `invocation.parameters.urls`.
	 *
	 * @param invocation The tool invocation; its parameters are expected to carry a `urls` string array.
	 * @param _countTokens Unused.
	 * @param _progress Unused.
	 * @param token Cancellation token forwarded to file reads.
	 * @returns One content part per requested URL (in request order) plus the list of
	 * URIs that were fetched as `toolResultDetails`.
	 */
	async invoke(invocation: IToolInvocation, _countTokens: CountTokensCallback, _progress: ToolProgress, token: CancellationToken): Promise<IToolResult> {
		const urls = (invocation.parameters as { urls?: string[] }).urls || [];
		const { webUris, fileUris, invalidUris } = this._parseUris(urls);
		const allValidUris = [...webUris.values(), ...fileUris.values()];

		if (!allValidUris.length && invalidUris.size === 0) {
			return {
				content: [{ kind: 'text', value: localize('fetchWebPage.noValidUrls', 'No valid URLs provided.') }]
			};
		}

		// We approved these via confirmation, so mark them as "approved" in this session
		// if they are not approved via the trusted domain service.
		for (const uri of webUris.values()) {
			this._alreadyApprovedDomains.add(uri);
		}

		// Fetched content keyed by the ORIGINAL url string. `webUris`/`fileUris` are maps,
		// so a URL that is requested more than once only has a single entry there. Looking
		// results up by key (instead of walking parallel arrays with running indices) keeps
		// every requested URL paired with its own content even when duplicates are present.
		const contentByUrl = new Map<string, string | IToolResultDataPart | undefined>();

		// Get contents from web URIs
		const webEntries = [...webUris.entries()];
		const webContents = webEntries.length > 0 ? await this._readerModeService.extract(webEntries.map(([, uri]) => uri)) : [];
		webEntries.forEach(([url], index) => contentByUrl.set(url, webContents[index]));

		// Get contents from file URIs
		const successfulFileUris: URI[] = [];
		for (const [url, uri] of fileUris.entries()) {
			try {
				const fileContent = await this._fileService.readFile(uri, undefined, token);

				// Check if this is a supported image type first
				const imageMimeType = this._getSupportedImageMimeType(uri);
				if (imageMimeType) {
					// For supported image files, return as IToolResultDataPart
					contentByUrl.set(url, {
						kind: 'data',
						value: {
							mimeType: imageMimeType,
							data: fileContent.value
						}
					});
				} else {
					// Check if the content is binary
					const detected = detectEncodingFromBuffer({ buffer: fileContent.value, bytesRead: fileContent.value.byteLength });

					if (detected.seemsBinary) {
						// For binary files, return a message indicating they're not supported
						// We do this for now until the tools that leverage this internal tool can support binary content
						contentByUrl.set(url, localize('fetchWebPage.binaryNotSupported', 'Binary files are not supported at the moment.'));
					} else {
						// For text files, convert to string
						contentByUrl.set(url, fileContent.value.toString());
					}
				}

				successfulFileUris.push(uri);
			} catch (error) {
				// If file service can't read it, treat as invalid
				contentByUrl.set(url, undefined);
			}
		}

		// Build results array in original order. URLs that failed to parse or to load
		// have no content and map to `undefined`, which is rendered as "Invalid URL".
		const results = urls.map(url => contentByUrl.get(url));

		// Only include URIs that actually had content successfully fetched
		const actuallyValidUris = [...webUris.values(), ...successfulFileUris];

		return {
			content: this._getPromptPartsForResults(results),
			toolResultDetails: actuallyValidUris
		};
	}

	/**
	 * Builds the progress / past-tense messages for an upcoming invocation and, when
	 * at least one URI has not been fetched before, the confirmation prompt.
	 *
	 * @param context Preparation context; `context.parameters.urls` is parsed as the URL list.
	 * @param token Unused.
	 * @returns The prepared invocation messages, with `confirmationMessages` set only
	 * when some URI still needs confirmation.
	 */
	async prepareToolInvocation(context: IToolInvocationPreparationContext, token: CancellationToken): Promise<IPreparedToolInvocation | undefined> {
		const { webUris, fileUris, invalidUris } = this._parseUris(context.parameters.urls);

		// Check which file URIs can actually be read
		const validFileUris: URI[] = [];
		const additionalInvalidUrls: string[] = [];
		for (const [originalUrl, uri] of fileUris.entries()) {
			try {
				await this._fileService.stat(uri);
				validFileUris.push(uri);
			} catch (error) {
				// If file service can't stat it, treat as invalid
				additionalInvalidUrls.push(originalUrl);
			}
		}

		const invalid = [...Array.from(invalidUris), ...additionalInvalidUrls];
		const valid = [...webUris.values(), ...validFileUris];
		const urlsNeedingConfirmation = valid.filter(url => !this._alreadyApprovedDomains.has(url));

		let pastTenseMessage: MarkdownString;
		if (invalid.length > 1) {
			// If there are multiple invalid URLs, show them all
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.plural',
					'Fetched {0} resources, but the following were invalid URLs:\n\n{1}\n\n', valid.length, invalid.map(url => `- ${url}`).join('\n')
				));
		} else if (invalid.length === 1) {
			// If there is only one invalid URL, show it
			pastTenseMessage = new MarkdownString(
				localize(
					'fetchWebPage.pastTenseMessage.singular',
					'Fetched resource, but the following was an invalid URL:\n\n{0}\n\n', invalid[0]
				));
		} else {
			// No invalid URLs
			pastTenseMessage = new MarkdownString();
		}

		const invocationMessage = new MarkdownString();
		if (valid.length > 1) {
			pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.plural', 'Fetched {0} resources', valid.length));
			invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.plural', 'Fetching {0} resources', valid.length));
		} else if (valid.length === 1) {
			const url = valid[0].toString();
			// If the URL is too long or it's a file url, show it as a link... otherwise, show it as plain text
			if (url.length > 400 || validFileUris.length === 1) {
				pastTenseMessage.appendMarkdown(localize({
					key: 'fetchWebPage.pastTenseMessageResult.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetched [resource]({0})', url));
				invocationMessage.appendMarkdown(localize({
					key: 'fetchWebPage.invocationMessage.singularAsLink',
					comment: [
						// Make sure the link syntax is correct
						'{Locked="]({0})"}',
					]
				}, 'Fetching [resource]({0})', url));
			} else {
				pastTenseMessage.appendMarkdown(localize('fetchWebPage.pastTenseMessageResult.singular', 'Fetched {0}', url));
				invocationMessage.appendMarkdown(localize('fetchWebPage.invocationMessage.singular', 'Fetching {0}', url));
			}
		}

		const result: IPreparedToolInvocation = { invocationMessage, pastTenseMessage };
		if (urlsNeedingConfirmation.length) {
			let confirmationTitle: string;
			let confirmationMessage: string | MarkdownString;
			if (urlsNeedingConfirmation.length === 1) {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.singular', 'Fetch web page?');
				confirmationMessage = new MarkdownString(
					urlsNeedingConfirmation[0].toString(),
					{ supportThemeIcons: true }
				);
			} else {
				confirmationTitle = localize('fetchWebPage.confirmationTitle.plural', 'Fetch web pages?');
				confirmationMessage = new MarkdownString(
					urlsNeedingConfirmation.map(uri => `- ${uri.toString()}`).join('\n'),
					{ supportThemeIcons: true }
				);
			}
			result.confirmationMessages = {
				title: confirmationTitle,
				message: confirmationMessage,
				allowAutoConfirm: true,
				disclaimer: new MarkdownString('$(info) ' + localize('fetchWebPage.confirmationMessage.plural', 'Web content may contain malicious code or attempt prompt injection attacks.'), { supportThemeIcons: true })
			};
		}
		return result;
	}

	/**
	 * Splits the requested URL strings into web URIs (`http`/`https`), URIs of any
	 * other scheme (to be read via the file service), and strings that failed to parse.
	 * The maps are keyed by the original URL string, so repeated URLs collapse to one entry.
	 */
	private _parseUris(urls?: string[]): { webUris: Map<string, URI>; fileUris: Map<string, URI>; invalidUris: Set<string> } {
		const webUris = new Map<string, URI>();
		const fileUris = new Map<string, URI>();
		const invalidUris = new Set<string>();

		urls?.forEach(url => {
			try {
				const uriObj = URI.parse(url);
				if (uriObj.scheme === 'http' || uriObj.scheme === 'https') {
					webUris.set(url, uriObj);
				} else {
					// Try to handle other schemes via file service
					fileUris.set(url, uriObj);
				}
			} catch (e) {
				invalidUris.add(url);
			}
		});

		return { webUris, fileUris, invalidUris };
	}

	/**
	 * Converts per-URL results into tool result parts: falsy values (including an
	 * empty string) become an "Invalid URL" text part, strings become text parts,
	 * and data parts are passed through unchanged.
	 */
	private _getPromptPartsForResults(results: (string | IToolResultDataPart | undefined)[]): (IToolResultTextPart | IToolResultDataPart)[] {
		return results.map(value => {
			if (!value) {
				return {
					kind: 'text',
					value: localize('fetchWebPage.invalidUrl', 'Invalid URL')
				};
			} else if (typeof value === 'string') {
				return {
					kind: 'text',
					value: value
				};
			} else {
				// This is an IToolResultDataPart
				return value;
			}
		});
	}

	/**
	 * Maps the (case-insensitive) file extension of `uri.path` to a supported chat
	 * image mime type, or returns `undefined` for any other extension.
	 */
	private _getSupportedImageMimeType(uri: URI): ChatImageMimeType | undefined {
		const ext = extname(uri.path).toLowerCase();
		switch (ext) {
			case '.png':
				return ChatImageMimeType.PNG;
			case '.jpg':
			case '.jpeg':
				return ChatImageMimeType.JPEG;
			case '.gif':
				return ChatImageMimeType.GIF;
			case '.webp':
				return ChatImageMimeType.WEBP;
			case '.bmp':
				return ChatImageMimeType.BMP;
			default:
				return undefined;
		}
	}
}
286
287