Path: blob/main/extensions/copilot/src/platform/ignore/node/remoteContentExclusion.ts
13401 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { RequestType } from '@vscode/copilot-api';
import { minimatch } from 'minimatch';
import { createSha256Hash } from '../../../util/common/crypto';
import { coalesce } from '../../../util/vs/base/common/arrays';
import { Limiter, raceCancellationError } from '../../../util/vs/base/common/async';
import { CancellationToken } from '../../../util/vs/base/common/cancellation';
import { IDisposable } from '../../../util/vs/base/common/lifecycle';
import { ResourceMap } from '../../../util/vs/base/common/map';
import { URI } from '../../../util/vs/base/common/uri';
import { IAuthenticationService } from '../../authentication/common/authentication';
import { ICAPIClientService } from '../../endpoint/common/capiClient';
import { IFileSystemService } from '../../filesystem/common/fileSystemService';
import { readFileFromTextBufferOrFS } from '../../filesystem/node/fileSystemServiceImpl';
import { IGitService, RepoContext, normalizeFetchUrl } from '../../git/common/gitService';
import { ILogService } from '../../log/common/logService';
import { Response } from '../../networking/common/fetcherService';
import { IRequestLogger } from '../../requestLogger/common/requestLogger';
import { IWorkspaceService } from '../../workspace/common/workspaceService';

/** A single content exclusion rule as returned by the content exclusion endpoint. */
type ContentExclusionRule = {
	paths: string[];
	ifNoneMatch?: string[];
	ifAnyMatch?: string[];
	source: { name: string; type: 'Repository' | 'Organization' };
};

/** The response entry for one requested repository. */
type ContentExclusionResponse = {
	rules: ContentExclusionRule[];
	last_updated_at: number;
};

/** The root path of a repository together with its normalized remote fetch urls. */
type RepoMetadata = { repoRootPath: string; fetchUrls: string[] };

/** Cache key under which the rules for files outside of any git repository are stored. */
const NON_GIT_FILE_KEY = 'non-git-file';

/** How long fetched rules are considered fresh before they are fetched again (30 minutes). */
const RULE_STALENESS_MS = 30 * 60 * 1000;

/**
 * Fetches content exclusion policies from GH remotes
 */
export class RemoteContentExclusion implements IDisposable {
	// The cache which maps remote fetch url to the minimatch patterns, order of patterns matters here
	private _contentExclusionCache: Map<string, { patterns: string[]; ifAnyMatch: RegExp[]; ifNoneMatch: RegExp[] }> = new Map();
	// The request currently in flight (if any). The stored promise never rejects, failures are logged instead.
	private _contentExclusionFetchPromise: Promise<void> | null = null;
	// This caches the ignore results as they can be expensive to compute and a single render can request results 100s of times
	private _ignoreGlobResultCache: ResourceMap<boolean> = new ResourceMap();
	// Map of (repository fetch urls + hash of file contents) to the result of the regex check.
	// The fetch urls are part of the key because the regex rules differ per repository.
	private _ignoreRegexResultCache: Map<string, boolean> = new Map();
	private _lastRuleFetch = 0;
	private _disposables: IDisposable[] = [];
	private readonly _fileReadLimiter: Limiter<string | Uint8Array>;
	// Cache of repository root paths to their metadata to avoid calling getRepositoryFetchUrls for every file
	// This is critical for performance when there are many files in a workspace
	private readonly _repoRootCache: Map<string, RepoMetadata> = new Map();

	constructor(
		private readonly _gitService: IGitService,
		private readonly _logService: ILogService,
		private readonly _authService: IAuthenticationService,
		private readonly _capiClientService: ICAPIClientService,
		private readonly _fileSystemService: IFileSystemService,
		private readonly _workspaceService: IWorkspaceService,
		private readonly _requestLogger: IRequestLogger
	) {
		// This is a specialized entry to store the global rules that apply to files outside of any git repository
		// The other option was to maintain a separate cache for non git files but that would be redundant
		this._contentExclusionCache.set(NON_GIT_FILE_KEY, { patterns: [], ifAnyMatch: [], ifNoneMatch: [] });
		this._disposables.push(this._gitService.onDidCloseRepository((r) => {
			const repoInfo = this.getRepositoryInfo(r);
			if (!repoInfo) {
				return;
			}
			// Remove from repo root cache
			this._repoRootCache.delete(repoInfo.repoRootPath);
			for (const url of repoInfo.fetchUrls) {
				this._contentExclusionCache.delete(url);
			}
		}));

		this._fileReadLimiter = new Limiter<string | Uint8Array>(10);
		this._disposables.push(this._fileReadLimiter);
	}

	/**
	 * Determines whether the given file is excluded by the content exclusion rules.
	 *
	 * Glob rules are checked first. Only when a repository of the file has regex rules
	 * (`ifAnyMatch` / `ifNoneMatch`) is the start of the file read and tested against them.
	 *
	 * @param file The file to check
	 * @param token Cancels waiting on repository lookup and rule fetching
	 * @returns `true` when the file is excluded. A file that cannot be read while regex rules apply is reported as excluded.
	 */
	public async isIgnored(file: URI, token: CancellationToken = CancellationToken.None): Promise<boolean> {
		// 1. If glob is not ignored, but there is no regex we can return false as the URI will not change
		// 2. If glob is not ignored, but there are regex we need to read file content which will happen lower in the regex code.
		// 3. If glob is ignored, it will return true despite regex since the most restrictive exclusion takes the cake
		if ((this._ignoreGlobResultCache.has(file) && !this.isRegexContextExclusionsEnabled) || this._ignoreGlobResultCache.get(file)) {
			return this._ignoreGlobResultCache.get(file) ?? false;
		}
		// Any pending requests that may be in flight should be awaited before returning a result
		if (this._contentExclusionFetchPromise) {
			await raceCancellationError(this._contentExclusionFetchPromise, token);
		}

		// Try to find the repository from the cache first to avoid expensive git extension calls
		// This is critical for performance when there are many files in a workspace
		let repoMetadata = this.findCachedRepoMetadataForFile(file);

		// If not in cache, query the git extension (this is expensive for many files)
		if (!repoMetadata) {
			const repo = await raceCancellationError(this._gitService.getRepositoryFetchUrls(file), token);
			repoMetadata = this.getRepositoryInfo(repo);
			// Cache the result for future lookups
			if (repoMetadata) {
				this._repoRootCache.set(repoMetadata.repoRootPath, repoMetadata);
			}
		}

		// No repository is associated with this file, so we set it to the 'virtual' non-git file repo / key
		// This way when we go to lookup rules for this file it will pull the non git file rules
		if (!repoMetadata) {
			repoMetadata = { repoRootPath: '', fetchUrls: [NON_GIT_FILE_KEY] };
		}

		const fileName = file.path.toLowerCase().replace(repoMetadata.repoRootPath.toLowerCase(), '');

		// We're missing entries for this repository in the cache, so we fetch it.
		// Or it has been more than 30 minutes so the current rules are stale.
		// `shouldFetchContentExclusionRules` seeds the cache as a side effect, so it must only be evaluated once:
		// a second call would always report `false` and the log below would name the wrong reason.
		const isMissingRules = this.shouldFetchContentExclusionRules(repoMetadata);
		if (isMissingRules || (Date.now() - this._lastRuleFetch > RULE_STALENESS_MS)) {
			this._logService.trace(`Fetching content exclusions, due to ${isMissingRules ? 'repository change' : 'stale cache'}.`);
			this._lastRuleFetch = Date.now();
			await raceCancellationError(this.makeContentExclusionRequest(), token);
		}

		const minimatchConfig = {
			nocase: true,
			matchBase: true,
			nonegate: true,
			dot: true
		};

		// Glob rules of every cached repository are applied, both to the repo relative and the full path
		for (const { patterns } of this._contentExclusionCache.values()) {
			for (const rule of patterns) {
				const matchesPattern = minimatch(fileName, rule, minimatchConfig) || minimatch(file.path, rule, minimatchConfig);
				if (matchesPattern) {
					this._logService.debug(`File ${file.path} is ignored by content exclusion rule ${rule}`);
					this._ignoreGlobResultCache.set(file, true);
					return true;
				}
			}
		}

		let fileContents = '';
		// Stays empty until the file has been read. Used as the "already read" marker so that empty files are not read repeatedly.
		let fileContentHash = '';
		let regexCacheKey = '';
		for (const fetchUrl of repoMetadata.fetchUrls) {
			const { ifAnyMatch, ifNoneMatch } = this._contentExclusionCache.get(fetchUrl) ?? { ifAnyMatch: [], ifNoneMatch: [] };
			// We only want to read the file if we absolutely must as it can be expensive
			if ((ifAnyMatch.length > 0 || ifNoneMatch.length > 0) && !fileContentHash) {
				try {
					// Read the file contents and hash it so we can cache the result - Only reads up to 1KB of the file, as reading too much can be expensive and regex exclusions are normally header based
					// Note: This feature is internal only so we can adapt the implementation as needed without breaking clients.
					const fileContentOrBuffer = await this._fileReadLimiter.queue(() => readFileFromTextBufferOrFS(this._fileSystemService, this._workspaceService, file, 1024));
					fileContents = typeof fileContentOrBuffer === 'string' ? fileContentOrBuffer : new TextDecoder().decode(fileContentOrBuffer);
					fileContentHash = await createSha256Hash(fileContents);
					regexCacheKey = `${repoMetadata.fetchUrls.join('\n')}\n${fileContentHash}`;
					// Cache hit for these file contents, no need to run the regex patterns
					const cachedResult = this._ignoreRegexResultCache.get(regexCacheKey);
					if (cachedResult !== undefined) {
						return cachedResult;
					}
				} catch {
					// We failed to read the file, so it should just be ignored as we have no idea what the contents are or if it exists
					return true;
				}
			}
			if (ifAnyMatch.length > 0 && fileContents && ifAnyMatch.some(pattern => testStateless(pattern, fileContents))) {
				this._logService.debug(`File ${file.path} is ignored by content exclusion rule ifAnyMatch`);
				this._ignoreRegexResultCache.set(regexCacheKey, true);
				return true;
			}
			if (ifNoneMatch.length > 0 && fileContents && !ifNoneMatch.some(pattern => testStateless(pattern, fileContents))) {
				this._logService.debug(`File ${file.path} is ignored by content exclusion rule ifNoneMatch`);
				this._ignoreRegexResultCache.set(regexCacheKey, true);
				return true;
			}
		}

		this._ignoreGlobResultCache.set(file, false);
		// Only record a regex result when the file was actually read, otherwise there is no hash to key it by
		if (regexCacheKey) {
			this._ignoreRegexResultCache.set(regexCacheKey, false);
		}
		return false;
	}

	/**
	 * Returns whether or not there are regex context exclusions.
	 */
	public get isRegexContextExclusionsEnabled(): boolean {
		return [...this._contentExclusionCache.values()].some(({ ifAnyMatch, ifNoneMatch }: { ifAnyMatch: RegExp[]; ifNoneMatch: RegExp[] }) => ifAnyMatch.length > 0 || ifNoneMatch.length > 0);
	}

	/**
	 * Loads the content exclusion rules for the given repositories. Primarily used to load a bunch of repos at once prior to a search for example.
	 * @param repoUris The list of repository URIs to load the content exclusion rules for
	 */
	public async loadRepos(repoUris: URI[]) {
		const repos = await Promise.all(repoUris.map(uri => this._gitService.getRepositoryFetchUrls(uri)));
		const repoInfos = repos.map(repo => {
			const repoInfo = this.getRepositoryInfo(repo);
			// Populate the repo root cache for future lookups
			if (repoInfo) {
				this._repoRootCache.set(repoInfo.repoRootPath, repoInfo);
			}
			return this.shouldFetchContentExclusionRules(repoInfo);
		});
		if (repoInfos.some(info => info)) {
			this._lastRuleFetch = Date.now();
			await this.makeContentExclusionRequest();
		}
	}

	/**
	 * Waits for a pending rule fetch (if any) and returns the glob patterns of all cached repositories as one flat list.
	 */
	public async asMinimatchPatterns() {
		await this._contentExclusionFetchPromise;
		const patterns: string[] = Array.from(this._contentExclusionCache.values()).flatMap(({ patterns }) => patterns);
		return patterns;
	}

	public dispose() {
		this._disposables.forEach(d => d.dispose());
		this._disposables = [];
		this._contentExclusionCache.clear();
	}

	/**
	 * Checks whether any fetch url of the repository has no entry in the rule cache yet.
	 *
	 * Side effect: every missing fetch url is seeded with an empty rule set, so calling this
	 * a second time for the same repository returns `false`.
	 */
	private shouldFetchContentExclusionRules(repoInfo: RepoMetadata | undefined): boolean {
		if (!repoInfo) {
			return false;
		}
		let shouldFetch = false;
		for (const remoteRepoUrl of repoInfo.fetchUrls) {
			if (!this._contentExclusionCache.has(remoteRepoUrl)) {
				shouldFetch = true;
				this._contentExclusionCache.set(remoteRepoUrl, { patterns: [], ifAnyMatch: [], ifNoneMatch: [] });
			}
		}
		return shouldFetch;
	}

	/**
	 * A wrapper around the actual request which ensures only one request is pending at any time.
	 * Failures are logged and never thrown, neither to this caller nor to anyone awaiting the in-flight promise.
	 * TODO @lramos15 add cancellation to cancel the old request in flight
	 * @returns The promise which resolves when the request is complete
	 */
	private async makeContentExclusionRequest(): Promise<void> {
		// Loop rather than await once: several callers may have been waiting on the same request
		// and only one of them may start the next one.
		while (this._contentExclusionFetchPromise) {
			await this._contentExclusionFetchPromise;
		}
		const request = this._contentExclusionRequest().catch(err => {
			this._logService.error(`Content exclusion request failed: ${err instanceof Error ? err.message : String(err)}`);
		});
		this._contentExclusionFetchPromise = request;
		try {
			await request;
		} finally {
			// Only clear our own request, never one that was started after it
			if (this._contentExclusionFetchPromise === request) {
				this._contentExclusionFetchPromise = null;
			}
		}
	}

	/**
	 * The actual function that fetches the content exclusion rules from the GH API.
	 * Not recommended to call directly and instead use {@link makeContentExclusionRequest} as that ensures only one call is pending at any time
	 */
	private async _contentExclusionRequest(): Promise<void> {
		// Clear the result caches as new rules will come and therefore they are no longer valid
		this._ignoreGlobResultCache.clear();
		this._ignoreRegexResultCache.clear();
		const startTime = Date.now();
		const capiClientService = this._capiClientService;
		const ghToken = (await this._authService.getGitHubSession('any', { silent: true }))?.accessToken;
		const remoteFetchUrls = Array.from(this._contentExclusionCache.keys());
		const updateRulesForRepos = async (reposToFetch: string[]) => {

			const response = await capiClientService.makeRequest<Response>({
				headers: {
					'Authorization': `token ${ghToken}`
				},
			}, { type: RequestType.ContentExclusion, repos: reposToFetch });

			if (!response.ok) {
				this._logService.error(`Failed to fetch content exclusion rules: ${response?.statusText}`);
				return;
			}
			const data: ContentExclusionResponse[] = await response.json();
			// The response entries are matched to the requested repos by position
			for (let j = 0; j < data.length; j++) {
				const rules = data[j].rules;
				const patterns = rules.flatMap(rule => rule.paths);
				const ifAnyMatch = coalesce(rules.flatMap(rule => rule.ifAnyMatch ?? [])).map(pattern => stringToRegex(pattern));
				const ifNoneMatch = coalesce(rules.flatMap(rule => rule.ifNoneMatch ?? [])).map(pattern => stringToRegex(pattern));
				const repo = reposToFetch[j];
				const rulesForRepo = { patterns, ifAnyMatch, ifNoneMatch };
				this._contentExclusionCache.set(repo, rulesForRepo);
				this._logService.trace(`Fetched content exclusion rules for ${repo}: ${JSON.stringify(rulesForRepo)}`);
			}
		};

		// This is needed to fetch the global rules that could apply to non git files
		if (remoteFetchUrls.length === 0) {
			await updateRulesForRepos([]);
		}

		// Process in batches of 10 as that's the max content exclusion rules we can fetch at a time
		for (let i = 0; i < remoteFetchUrls.length; i += 10) {
			const batch = remoteFetchUrls.slice(i, i + 10);
			await updateRulesForRepos(batch);
		}
		this._lastRuleFetch = Date.now();
		this._logService.info(`Fetched content exclusion rules in ${Date.now() - startTime}ms`);

		// Log the fetched rules to the request logger for debugging visibility
		const repos = Array.from(this._contentExclusionCache.keys());
		const rules = repos.map(repo => {
			const entry = this._contentExclusionCache.get(repo)!;
			return {
				patterns: entry.patterns,
				ifAnyMatch: entry.ifAnyMatch.map(r => r.toString()),
				ifNoneMatch: entry.ifNoneMatch.map(r => r.toString())
			};
		});
		this._requestLogger.logContentExclusionRules(repos, rules, Date.now() - startTime);
	}

	/**
	 * Converts a repository context into its root path and normalized fetch urls.
	 * Remote urls which are empty or cannot be normalized are dropped.
	 * @returns `undefined` when there is no repository or it has no remote fetch urls
	 */
	private getRepositoryInfo(repo: Pick<RepoContext, 'rootUri' | 'remoteFetchUrls'> | undefined): RepoMetadata | undefined {
		if (!repo || !repo.remoteFetchUrls) {
			return undefined;
		}
		const fetchUrls = coalesce(repo.remoteFetchUrls.map(url => {
			if (!url) {
				return undefined;
			}
			// This can throw when the URL is something like a local file path which is a valid git remote
			try {
				return normalizeFetchUrl(url);
			} catch {
				return undefined;
			}
		}));
		return { repoRootPath: repo.rootUri.path, fetchUrls: fetchUrls };
	}

	/**
	 * Finds cached repository metadata for a file by checking if the file path
	 * starts with any known repository root path.
	 * Returns the most specific (longest) matching repository to handle nested repos/submodules correctly.
	 * This avoids expensive calls to the git extension API for every file.
	 */
	private findCachedRepoMetadataForFile(file: URI): RepoMetadata | undefined {
		const filePath = file.path.toLowerCase();
		let bestMatch: RepoMetadata | undefined;
		let bestMatchLength = 0;

		for (const [repoRootPath, metadata] of this._repoRootCache.entries()) {
			const normalizedRepoRoot = repoRootPath.toLowerCase();
			if ((filePath.startsWith(normalizedRepoRoot + '/') || filePath === normalizedRepoRoot) &&
				normalizedRepoRoot.length > bestMatchLength) {
				bestMatch = metadata;
				bestMatchLength = normalizedRepoRoot.length;
			}
		}
		return bestMatch;
	}
}

/**
 * Tests a regex against a text without being affected by state left over from earlier tests.
 * Regexes with the `g` or `y` flag remember `lastIndex` between `test` calls, which would make
 * the result for one file depend on the file that was tested before it.
 */
function testStateless(pattern: RegExp, text: string): boolean {
	pattern.lastIndex = 0;
	return pattern.test(text);
}

/**
 * Convert a given string /pattern/flags to a RegExp object.
 *
 * A string is only treated as a `/pattern/flags` literal when it starts with a slash, has a closing
 * slash and what follows the closing slash is a valid set of flags. Everything else (`pattern`,
 * `foo/`, `/foo`, `/usr/local`) is used as the pattern as-is.
 *
 * @throws SyntaxError when the pattern itself is not a valid regular expression
 */
function stringToRegex(str: string): RegExp {
	// Greedy match, so the last slash is taken as the closing delimiter
	const literal = /^\/([\s\S]*)\/([a-z]*)$/.exec(str);
	if (literal) {
		const [, pattern, flags] = literal;
		try {
			return new RegExp(pattern, flags);
		} catch {
			// Not a valid literal, e.g. the part after the last slash is not a set of flags. Use the whole string as the pattern instead.
		}
	}
	return new RegExp(str);
}