Path: blob/main/src/vs/platform/agentHost/test/node/protocol/toolApprovalRealSdk.integrationTest.ts
13405 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

/**
 * Integration tests using the real Copilot SDK instead of a mock agent.
 *
 * These tests are **disabled by default**. To run them, set `AGENT_HOST_REAL_SDK=1`:
 *
 *   AGENT_HOST_REAL_SDK=1 ./scripts/test-integration.sh --run src/vs/platform/agentHost/test/node/protocol/toolApprovalRealSdk.integrationTest.ts
 *
 * Authentication: By default the token is obtained from `gh auth token`.
 * You can override it by setting `GITHUB_TOKEN=ghp_xxx`.
 *
 * SAFETY: These tests create real agent sessions backed by the Copilot SDK.
 * The agent may execute tool calls on the user's machine. Prompts should be
 * carefully chosen to avoid destructive side-effects — prefer read-only
 * questions, safe commands like `echo`, and use isolated temp directories as
 * working directories.
* Never ask the agent to delete, modify, or install
* anything outside of a test-owned temp directory.
*/

import assert from 'assert';
import { execSync } from 'child_process';
import { mkdtempSync, rmSync, writeFileSync } from 'fs';
import { tmpdir } from 'os';
import { removeAnsiEscapeCodes } from '../../../../../base/common/strings.js';
import { URI } from '../../../../../base/common/uri.js';
import type { SessionToolCallStartAction } from '../../../common/state/protocol/actions.js';
import { SubscribeResult } from '../../../common/state/protocol/commands.js';
import { PROTOCOL_VERSION } from '../../../common/state/sessionCapabilities.js';
import { ResponsePartKind, ROOT_STATE_URI, SessionInputAnswerState, SessionInputAnswerValueKind, SessionInputQuestionKind, SessionInputResponseKind, ToolResultContentType, isSubagentSession, type SessionInputAnswer, type SessionInputRequest, type SessionState, type TerminalState, type ToolResultContent, type ToolResultSubagentContent } from '../../../common/state/sessionState.js';
import type { RootState } from '../../../common/state/protocol/state.js';
import type { RootAgentsChangedAction, SessionAddedNotification, SessionInputRequestedAction, SessionToolCallReadyAction } from '../../../common/state/sessionActions.js';
import type { INotificationBroadcastParams } from '../../../common/state/sessionProtocol.js';
import {
	getActionEnvelope,
	isActionNotification,
	IServerHandle,
	startRealServer,
	TestProtocolClient,
} from './testHelpers.js';

/** The suite only runs when the caller opts in with `AGENT_HOST_REAL_SDK=1`. */
const REAL_SDK_ENABLED = process.env['AGENT_HOST_REAL_SDK'] === '1';

/**
 * Resolve the GitHub token from the environment or from `gh auth token`.
 *
 * `GITHUB_TOKEN` wins when it is set to a non-empty value; otherwise the
 * GitHub CLI is invoked synchronously and its trimmed stdout is used.
 *
 * @returns A non-empty token string.
 * @throws Error when `GITHUB_TOKEN` is unset/empty and `gh auth token` either
 * fails or prints nothing.
 */
function resolveGitHubToken(): string {
	const envToken = process.env['GITHUB_TOKEN'];
	if (envToken) {
		return envToken;
	}

	let cliToken: string;
	try {
		cliToken = execSync('gh auth token', { encoding: 'utf-8' }).trim();
	} catch (e) {
		// Keep the underlying reason (missing binary, not logged in, ...) so
		// the failure can be diagnosed from the test output alone.
		const reason = e instanceof Error ? e.message : String(e);
		throw new Error(`No GITHUB_TOKEN set and \`gh auth token\` failed. Run \`gh auth login\` first. (${reason})`);
	}

	// A zero exit code with empty output would otherwise be passed on as an
	// empty token and only surface later as an opaque authentication failure.
	if (!cliToken) {
		throw new Error('No GITHUB_TOKEN set and `gh auth token` returned an empty token. Run `gh auth login` first.');
	}
	return cliToken;
}

/**
 * Create a session using the real copilot provider, authenticate, subscribe,
 * and return the session URI.
 *
 * Thin wrapper over {@link createRealSessionFull} for callers that only need
 * the URI.
 */
async function createRealSession(c: TestProtocolClient, clientId: string, trackingList: string[], workingDirectory?: string): Promise<string> {
	const { sessionUri } = await createRealSessionFull(c, clientId, trackingList, workingDirectory);
	return sessionUri;
}

/** Everything {@link createRealSessionFull} learned while creating a session. */
interface IRealSessionResult {
	/** Resource URI taken from the `notify/sessionAdded` summary. */
	sessionUri: string;
	/** The `notify/sessionAdded` notification observed after `createSession`. */
	addedNotification: SessionAddedNotification;
	/** Session state returned by the initial `subscribe` call. */
	subscribeSnapshot: SessionState;
}

/**
 * Full version that returns the sessionAdded notification and subscribe
 * snapshot for assertions.
 *
 * Steps: `initialize` → `authenticate` → `createSession` → wait for
 * `notify/sessionAdded` → `subscribe` → clear the client's received list.
 *
 * @param c Connected protocol client.
 * @param clientId Client id sent with `initialize`.
 * @param trackingList Receives the created session's resource URI so the
 * caller can dispose it later.
 * @param workingDirectory Optional working directory forwarded to `createSession`.
 */
async function createRealSessionFull(c: TestProtocolClient, clientId: string, trackingList: string[], workingDirectory?: string): Promise<IRealSessionResult> {
	await c.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId }, 30_000);

	await c.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() }, 30_000);

	const sessionUri = URI.from({ scheme: 'copilotcli', path: `/real-test-${Date.now()}` }).toString();
	await c.call('createSession', { session: sessionUri, provider: 'copilotcli', workingDirectory }, 30_000);

	const notif = await c.waitForNotification(n =>
		n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',
		15_000,
	);
	const addedNotification = (notif.params as INotificationBroadcastParams).notification as SessionAddedNotification;

	// The URI from the notification, not the requested one, is what gets
	// tracked, subscribed to and returned.
	const realSessionUri = addedNotification.summary.resource;
	trackingList.push(realSessionUri);

	const subscribeResult = await c.call<SubscribeResult>('subscribe', { resource: realSessionUri });
	const subscribeSnapshot = subscribeResult.snapshot.state as SessionState;
	c.clearReceived();

	return { sessionUri: realSessionUri, addedNotification, subscribeSnapshot };
}

/** Dispatch a turn with the given user message text.
*/
function dispatchTurn(c: TestProtocolClient, session: string, turnId: string, text: string, clientSeq: number): void {
	// Sent as a fire-and-forget `dispatchAction` notification; callers observe
	// progress through the notifications the client receives afterwards.
	c.notify('dispatchAction', {
		clientSeq,
		action: {
			type: 'session/turnStarted',
			session,
			turnId,
			userMessage: { text },
		},
	});
}

/**
 * Builds a "submitted" answer for every question of an input request so a
 * test can accept the request without user interaction.
 *
 * Answer selection per question kind:
 * - Text: `defaultValue`, else `'interactive'`.
 * - Number / Integer: `defaultValue`, else `min`, else `1`.
 * - Boolean: `defaultValue`, else `true`.
 * - SingleSelect: first option whose id or label matches /interactive/i,
 *   else the first recommended option, else the first option.
 * - MultiSelect: all recommended options, else the first option (if any).
 *
 * @returns A map from question id to answer, or `undefined` when the request
 * carries no questions.
 * @throws Error when a single-select question has no options, or when a
 * question has a kind this helper does not handle.
 */
function getAcceptedAnswers(request: SessionInputRequest): Record<string, SessionInputAnswer> | undefined {
	if (!request.questions?.length) {
		return undefined;
	}

	return Object.fromEntries(request.questions.map(question => {
		switch (question.kind) {
			case SessionInputQuestionKind.Text:
				return [question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Text,
						value: question.defaultValue ?? 'interactive',
					},
				} satisfies SessionInputAnswer];
			case SessionInputQuestionKind.Number:
			case SessionInputQuestionKind.Integer:
				return [question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Number,
						value: question.defaultValue ?? question.min ?? 1,
					},
				} satisfies SessionInputAnswer];
			case SessionInputQuestionKind.Boolean:
				return [question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Boolean,
						value: question.defaultValue ?? true,
					},
				} satisfies SessionInputAnswer];
			case SessionInputQuestionKind.SingleSelect: {
				const preferredOption = question.options.find(option => /interactive/i.test(option.id) || /interactive/i.test(option.label))
					?? question.options.find(option => option.recommended)
					?? question.options[0];
				// An empty option list used to crash on `preferredOption.id`
				// with an opaque TypeError; fail with a message that names
				// the offending question instead.
				if (!preferredOption) {
					throw new Error(`Single-select question '${question.id}' has no options to choose from`);
				}
				return [question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Selected,
						value: preferredOption.id,
					},
				} satisfies SessionInputAnswer];
			}
			case SessionInputQuestionKind.MultiSelect: {
				const preferredOptions = question.options.filter(option => option.recommended);
				const selectedOptions = preferredOptions.length > 0 ? preferredOptions : question.options.slice(0, 1);
				return [question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.SelectedMany,
						value: selectedOptions.map(option => option.id),
					},
				} satisfies SessionInputAnswer];
			}
			default:
				// Without this branch an unknown kind made the callback return
				// `undefined`, which `Object.fromEntries` rejects with a
				// message that says nothing about the question.
				throw new Error(`Unsupported input question kind: ${String((question as { kind?: unknown }).kind)}`);
		}
	}));
}

/**
 * Concatenates the markdown text the client has received so far.
 *
 * Only `session/delta` actions whose `partId` belongs to a markdown part that
 * was opened earlier in the received list contribute to the result.
 */
function getMarkdownResponseText(c: TestProtocolClient): string {
	// Markdown content arrives as a `session/responsePart` action that opens
	// the part with the first chunk, followed by `session/delta` actions
	// appending subsequent chunks. Concatenate both to get the full text.
	const markdownPartIds = new Set<string>();
	const pieces: string[] = [];
	for (const notification of c.receivedNotifications(n =>
		isActionNotification(n, 'session/responsePart') || isActionNotification(n, 'session/delta')
	)) {
		const action = getActionEnvelope(notification).action;
		if (action.type === 'session/responsePart' && action.part.kind === ResponsePartKind.Markdown) {
			markdownPartIds.add(action.part.id);
			pieces.push(action.part.content);
		} else if (action.type === 'session/delta' && markdownPartIds.has(action.partId)) {
			pieces.push(action.content);
		}
	}
	return pieces.join('');
}

/** Summary of what {@link driveTurnToCompletion} observed during a turn. */
interface IDrivenTurnResult {
	/** At least one `session/inputRequested` action was seen (and accepted). */
	sawInputRequest: boolean;
	/** At least one `session/toolCallReady` without `confirmed` was seen (and approved). */
	sawPendingConfirmation: boolean;
	/** Markdown text collected from the notifications received during the turn. */
	responseText: string;
}

/**
 * Dispatches a turn and keeps it moving until `session/turnComplete`.
 *
 * Clears the client's received list first, then reacts to each new
 * notification: unconfirmed tool calls are approved, input requests are
 * accepted with {@link getAcceptedAnswers}, and `session/error` aborts.
 * Follow-up actions use client sequence numbers starting at `clientSeq + 1`.
 *
 * @throws Error when a `session/error` action is received; the message
 * includes the server-provided error message when one is present.
 */
async function driveTurnToCompletion(c: TestProtocolClient, session: string, turnId: string, text: string, clientSeq: number): Promise<IDrivenTurnResult> {
	c.clearReceived();
	dispatchTurn(c, session, turnId, text, clientSeq);

	// Handled notifications are remembered by object identity so each wait
	// below only matches a notification that has not been processed yet.
	const seenNotifications = new Set<object>();
	let nextClientSeq = clientSeq + 1;
	let sawInputRequest = false;
	let sawPendingConfirmation = false;

	while (true) {
		const notification = await c.waitForNotification(n => !seenNotifications.has(n as object) && (
			isActionNotification(n, 'session/toolCallReady')
			|| isActionNotification(n, 'session/inputRequested')
			|| isActionNotification(n, 'session/turnComplete')
			|| isActionNotification(n, 'session/error')
		), 90_000);
		seenNotifications.add(notification as object);

		if (isActionNotification(notification, 'session/error')) {
			// Surface the server's own message; without it the failure only
			// says *that* the turn errored, not why.
			const detail = (getActionEnvelope(notification).action as { error?: { message?: string } }).error?.message;
			throw new Error(detail
				? `Session error while driving ${turnId}: ${detail}`
				: `Session error while driving ${turnId}`);
		}

		if (isActionNotification(notification, 'session/toolCallReady')) {
			const action = getActionEnvelope(notification).action as SessionToolCallReadyAction;
			if (!action.confirmed) {
				sawPendingConfirmation = true;
				c.notify('dispatchAction', {
					clientSeq: nextClientSeq++,
					action: {
						type: 'session/toolCallConfirmed',
						session,
						turnId,
						toolCallId: action.toolCallId,
						approved: true,
					},
				});
			}
			continue;
		}

		if (isActionNotification(notification, 'session/inputRequested')) {
			sawInputRequest = true;
			const action = getActionEnvelope(notification).action as SessionInputRequestedAction;
			c.notify('dispatchAction', {
				clientSeq: nextClientSeq++,
				action: {
					type: 'session/inputCompleted',
					session,
					requestId: action.request.id,
					response: SessionInputResponseKind.Accept,
					answers: getAcceptedAnswers(action.request),
				},
			});
			continue;
		}

		// Only `session/turnComplete` is left: the turn is done.
		break;
	}

	return {
		sawInputRequest,
		sawPendingConfirmation,
		responseText: getMarkdownResponseText(c),
	};
}

/** Returns the `resource` of the first terminal entry in `content`, if any. */
function terminalResourceFromContent(content: readonly ToolResultContent[]): string | undefined {
	for (const part of content) {
		if (part.type === ToolResultContentType.Terminal) {
			return part.resource;
		}
	}
	return undefined;
}

/**
 * Flattens a terminal state into plain text with ANSI escape codes removed.
 * `command` parts contribute their command line followed by their output on
 * the next line; every other part contributes its `value`.
 */
function terminalText(state: TerminalState): string {
	const raw = state.content
		.map(part => part.type === 'command' ? `${part.commandLine}\n${part.output}` : part.value)
		.join('');
	return removeAnsiEscapeCodes(raw);
}

/** Looks up the toolName for a toolCallReady by joining against the matching toolCallStart.
*/
function findToolNameForCall(c: TestProtocolClient, toolCallId: string): string | undefined {
	// Searches the notifications the client currently holds; yields
	// `undefined` when no `session/toolCallStart` carries this toolCallId.
	return c.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'))
		.map(n => getActionEnvelope(n).action as SessionToolCallStartAction)
		.find(a => a.toolCallId === toolCallId)?.toolName;
}

interface IApprovalRule {
	/** Tool name this rule applies to (e.g. `'bash'`, `'write_bash'`). */
	toolName: string;
	/** Optional predicate over the tool input. If omitted, any input matches. */
	matchInput?: (toolInput: string | undefined) => boolean;
	/**
	 * Optional inspector run for every matched call before approval.
	 * Push assertion failure messages onto `errors` to fail the test.
	 * If the inspector throws, the call is denied and the loop stops.
	 */
	inspect?: (info: {
		action: SessionToolCallReadyAction;
		errors: string[];
	}) => void;
}

interface IBackgroundApprovalLoopOptions {
	/**
	 * Base clientSeq for dispatched toolCallConfirmed actions. The counter is
	 * incremented before each dispatch, so the first action is sent with
	 * `approvalSeqStart + 1`. Avoids collisions with the test's own dispatches.
	 */
	approvalSeqStart: number;
	/**
	 * Allow-list of tool calls the loop is permitted to auto-approve. Each
	 * pending confirmation is checked against the rules in order and handled
	 * by the first rule that matches (by `toolName` plus optional
	 * `matchInput` predicate). Calls that don't match are recorded
	 * as errors and denied — the loop refuses to rubber-stamp anything the
	 * test didn't anticipate (e.g. an unexpected `rm` from the model).
	 */
	allow: readonly IApprovalRule[];
}

interface IBackgroundApprovalLoop {
	/** Errors collected during the run (unmatched tool calls + inspector failures). */
	readonly errors: readonly string[];
	/** Tool names that were observed and approved at least once. */
	readonly approvedToolNames: ReadonlySet<string>;
	/**
	 * Tool names for every permission request observed by the loop, regardless
	 * of whether they matched the allow-list. Useful for asserting that a
	 * tool with `skipPermission: true` never triggered a permission flow.
	 */
	readonly observedToolNames: ReadonlySet<string>;
	/** Stops the loop and waits for it to drain. */
	stop(): Promise<void>;
}

/**
 * Starts a background loop that auto-approves pending tool call confirmations
 * during a real-SDK turn, but only if they match the supplied allow-list.
 * Anything outside the allow-list is denied and recorded as an error so the
 * test fails loudly instead of silently approving model-chosen tool calls.
 *
 * Implementation note: `waitForNotification` does NOT consume notifications from
 * the client's queue, so we dedupe by `serverSeq`.
 *
 * @param c Client whose notifications are polled and through which
 * confirmations are dispatched.
 * @param options Allow-list and clientSeq base, see {@link IBackgroundApprovalLoopOptions}.
 * @returns Live views of the collected errors and tool names, plus `stop()`.
 */
function startBackgroundApprovalLoop(c: TestProtocolClient, options: IBackgroundApprovalLoopOptions): IBackgroundApprovalLoop {
	type PendingToolCall = SessionToolCallReadyAction & { session: string; turnId: string };

	const errors: string[] = [];
	const approvedToolNames = new Set<string>();
	const observedToolNames = new Set<string>();
	const processedSeqs = new Set<number>();
	let active = true;
	let approvalSeq = options.approvalSeqStart;

	/** Sends the approve/deny decision for one pending tool call. */
	const respond = (action: PendingToolCall, approved: boolean): void => {
		c.notify('dispatchAction', {
			clientSeq: ++approvalSeq,
			action: {
				type: 'session/toolCallConfirmed',
				session: action.session,
				turnId: action.turnId,
				toolCallId: action.toolCallId,
				approved,
			},
		});
	};

	const loop = (async () => {
		while (active) {
			try {
				const ready = await c.waitForNotification(n => {
					if (!isActionNotification(n, 'session/toolCallReady')) {
						return false;
					}
					return !processedSeqs.has(getActionEnvelope(n).serverSeq);
				}, 2_000);
				const envelope = getActionEnvelope(ready);
				processedSeqs.add(envelope.serverSeq);
				const action = envelope.action as PendingToolCall;
				if (action.confirmed) {
					// Already confirmed — nothing to decide.
					continue;
				}

				const toolName = findToolNameForCall(c, action.toolCallId);
				if (toolName) {
					observedToolNames.add(toolName);
				}
				const matchingRule = options.allow.find(rule =>
					rule.toolName === toolName
					&& (rule.matchInput?.(action.toolInput) ?? true));

				if (!matchingRule) {
					errors.push(`unexpected tool call: toolName=${toolName ?? '<unknown>'} input=${JSON.stringify(action.toolInput)}`);
					respond(action, false);
					continue;
				}

				// Run the inspector under its own guard. Previously a throwing
				// inspector fell through to the poll-timeout handler below: a
				// message containing "timed out" was silently swallowed, and in
				// every case the pending call was never answered, leaving the
				// turn blocked. Now the failure is always recorded, the call is
				// denied, and the loop stops.
				try {
					matchingRule.inspect?.({ action, errors });
				} catch (inspectError) {
					const inspectMsg = inspectError instanceof Error ? inspectError.message : String(inspectError);
					errors.push(`approval loop error: ${inspectMsg}`);
					respond(action, false);
					active = false;
					continue;
				}

				approvedToolNames.add(matchingRule.toolName);
				respond(action, true);
			} catch (e) {
				// Only ignore the expected 2-second poll timeout. Any other error
				// (e.g. 'Client closed') is a real failure — record it so the
				// test fails deterministically.
				const msg = e instanceof Error ? e.message : String(e);
				if (!msg.includes('Timed out') && !msg.includes('timed out')) {
					errors.push(`approval loop error: ${msg}`);
					active = false;
				}
			}
		}
	})();

	return {
		errors,
		approvedToolNames,
		observedToolNames,
		async stop(): Promise<void> {
			// The loop notices the flag once its current poll resolves or
			// times out.
			active = false;
			await loop;
		},
	};
}

(REAL_SDK_ENABLED ? suite : suite.skip)('Protocol WebSocket — Real Copilot SDK', function () {

	let server: IServerHandle;
	let client: TestProtocolClient;
	/** Session URIs created during the current test, disposed in teardown. */
	const createdSessions: string[] = [];
	/** Temp directories created during the current test, removed in teardown.
*/418const tempDirs: string[] = [];419420suiteSetup(async function () {421this.timeout(60_000);422server = await startRealServer();423});424425suiteTeardown(function () {426server?.process.kill();427});428429setup(async function () {430this.timeout(30_000);431client = new TestProtocolClient(server.port);432await client.connect();433});434435teardown(async function () {436// Dispose all sessions created during this test437for (const session of createdSessions) {438try {439await client.call('disposeSession', { session }, 5000);440} catch {441// Best-effort cleanup — the session may already be gone442}443}444createdSessions.length = 0;445client.close();446447// Remove temp directories created during this test. On Windows the448// agent subprocess can still hold handles to the working directory for449// a brief moment after `disposeSession` returns, which surfaces as450// EBUSY. Retry a few times to give the OS a chance to release the451// handle before failing the teardown.452for (const dir of tempDirs) {453try {454rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 200 });455} catch {456// Best-effort cleanup — leftover temp dirs in os.tmpdir() are457// harmless and shouldn't fail an otherwise passing test.458}459}460tempDirs.length = 0;461});462463// ---- Basic turn execution ------------------------------------------------464465test('sends a simple message and receives a response', async function () {466this.timeout(120_000);467468const sessionUri = await createRealSession(client, 'real-sdk-simple', createdSessions, URI.file(tmpdir()).toString());469dispatchTurn(client, sessionUri, 'turn-1', 'Say exactly "hello" and nothing else', 1);470471// Wait for the turn to complete — the real SDK may take a while472await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);473474// Verify we received at least one response part475const responseParts = client.receivedNotifications(n => isActionNotification(n, 
'session/responsePart'));476assert.ok(responseParts.length > 0, 'should have received at least one response part');477});478479// ---- Tool call with permission flow -------------------------------------480481test('tool call triggers permission request and can be approved', async function () {482this.timeout(120_000);483484const tempDir = mkdtempSync(`${tmpdir()}/ahp-perm-test-`);485tempDirs.push(tempDir);486const sessionUri = await createRealSession(client, 'real-sdk-permission', createdSessions, URI.file(tempDir).toString());487dispatchTurn(client, sessionUri, 'turn-perm', 'Run the shell command: echo "hello from test"', 1);488489// The real SDK should fire a tool call that needs permission490const toolStartNotif = await client.waitForNotification(491n => isActionNotification(n, 'session/toolCallStart'),49260_000,493);494const toolStartAction = getActionEnvelope(toolStartNotif).action as { toolCallId: string };495496// Wait for toolCallReady (pending confirmation)497const toolReadyNotif = await client.waitForNotification(498n => isActionNotification(n, 'session/toolCallReady'),49930_000,500);501const toolReadyAction = getActionEnvelope(toolReadyNotif).action as { toolCallId: string; confirmed?: string };502503// If the tool was auto-approved, confirmed will be set; if pending, confirm it504if (!toolReadyAction.confirmed) {505client.notify('dispatchAction', {506clientSeq: 2,507action: {508type: 'session/toolCallConfirmed',509session: sessionUri,510turnId: 'turn-perm',511toolCallId: toolStartAction.toolCallId,512approved: true,513},514});515}516517// Wait for the turn to complete518await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);519});520521test('planning-mode session-state writes are auto-approved in default mode', async function () {522this.timeout(180_000);523524const tempDir = mkdtempSync(`${tmpdir()}/ahp-plan-test-`);525tempDirs.push(tempDir);526const sessionUri = await createRealSession(client, 
'real-sdk-plan-mode', createdSessions, URI.file(tempDir).toString());527528// Switch the session into plan mode via the standard config-change flow529// before sending the first turn. The agent host reads this value at530// turn-start time and pushes it to the SDK via `rpc.mode.set`.531client.notify('dispatchAction', {532clientSeq: 1,533action: {534type: 'session/configChanged',535session: sessionUri,536config: { mode: 'plan' },537},538});539await client.waitForNotification(n => isActionNotification(n, 'session/configChanged'));540541const planTurn = await driveTurnToCompletion(client, sessionUri, 'turn-plan',542'Help me implement a Python script that prints "hello world" to stdout. Write the shortest possible plan to your session plan.md and use the exit_plan_mode tool to ask me to approve it before writing any code.', 2);543assert.strictEqual(planTurn.sawPendingConfirmation, false, 'should not have received pending-confirmation toolCallReady while writing session-state plan.md');544assert.ok(planTurn.sawInputRequest, 'should reach the exit_plan_mode question so the test can continue the same session');545546const extraSessionNotificationsAfterPlan = client.receivedNotifications(n =>547n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',548);549assert.strictEqual(extraSessionNotificationsAfterPlan.length, 0, 'should not create a second session while answering the plan-mode question');550551// Mirror what a real UI client would do after the user accepted the552// plan: update the session config so subsequent turns no longer run553// in plan mode. 
Without this the agent host would re-set the SDK's554// mode to 'plan' at the next send because the session config still555// holds the original 'plan' value.556client.notify('dispatchAction', {557clientSeq: 50,558action: {559type: 'session/configChanged',560session: sessionUri,561config: { mode: 'interactive' },562},563});564await client.waitForNotification(n => isActionNotification(n, 'session/configChanged'));565566const followupTurn = await driveTurnToCompletion(client, sessionUri, 'turn-followup',567'What did the plan I just approved say to print? Reply with exactly "hello world".', 100,568);569assert.strictEqual(followupTurn.sawPendingConfirmation, false, 'follow-up turn should not surface new pending confirmations');570assert.match(followupTurn.responseText, /hello world/i, 'follow-up turn should retain the original plan context');571572const extraSessionNotificationsAfterFollowup = client.receivedNotifications(n =>573n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',574);575assert.strictEqual(extraSessionNotificationsAfterFollowup.length, 0, 'sending another message should stay on the same session instead of forking');576577const resubscribeResult = await client.call<SubscribeResult>('subscribe', { resource: sessionUri });578const finalSnapshot = resubscribeResult.snapshot.state as SessionState;579assert.strictEqual(finalSnapshot.summary.resource, sessionUri, 'follow-up turn should keep the original session resource');580});581582// ---- Abort / cancel -----------------------------------------------------583584test('can abort a running turn', async function () {585this.timeout(120_000);586587const sessionUri = await createRealSession(client, 'real-sdk-abort', createdSessions, URI.file(tmpdir()).toString());588dispatchTurn(client, sessionUri, 'turn-abort', 'Write a very long essay about the history of computing', 1);589590// Wait a moment for the turn to start processing, then abort591await 
client.waitForNotification(592n => isActionNotification(n, 'session/responsePart') || isActionNotification(n, 'session/toolCallStart'),59360_000,594);595596client.notify('dispatchAction', {597clientSeq: 2,598action: {599type: 'session/abortTurn',600session: sessionUri,601},602});603604// Verify the abort action was echoed back by the server.605// We don't wait for turnComplete because the real Copilot SDK may606// continue streaming after abort and the turn may not terminate within607// the test timeout.608await client.waitForNotification(609n => isActionNotification(n, 'session/abortTurn'),61010_000,611);612});613614// ---- Working directory correctness --------------------------------------615616test('session is created with the correct working directory', async function () {617this.timeout(120_000);618619// Use a real temp directory so the path exists on disk.620// Clean it up at the end to avoid leaving test artifacts.621const tempDir = mkdtempSync(`${tmpdir()}/ahp-test-`);622tempDirs.push(tempDir);623const workingDirUri = URI.file(tempDir).toString();624625await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-workdir' });626await client.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() });627628const sessionUri = URI.from({ scheme: 'copilotcli', path: `/real-test-wd-${Date.now()}` }).toString();629await client.call('createSession', { session: sessionUri, provider: 'copilotcli', workingDirectory: workingDirUri });630631// 1. 
Verify workingDirectory in the sessionAdded notification632const addedNotif = await client.waitForNotification(n =>633n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',63415_000,635);636const addedSummary = ((addedNotif.params as INotificationBroadcastParams).notification as SessionAddedNotification).summary;637createdSessions.push(addedSummary.resource);638assert.strictEqual(639addedSummary.workingDirectory,640workingDirUri,641`sessionAdded notification should carry the requested working directory`,642);643644// 2. Subscribe and verify workingDirectory in the session state snapshot645const subscribeResult = await client.call<SubscribeResult>('subscribe', { resource: addedSummary.resource });646const sessionState = subscribeResult.snapshot.state as SessionState;647assert.strictEqual(648sessionState.summary.workingDirectory,649workingDirUri,650`subscribe snapshot summary should carry the requested working directory`,651);652});653654// ---- Worktree isolation -------------------------------------------------655656test('worktree session uses the resolved worktree as working directory', async function () {657this.timeout(120_000);658659// Set up a minimal git repo so the server can create a worktree660const tempDir = mkdtempSync(`${tmpdir()}/ahp-wt-test-`);661tempDirs.push(tempDir, `${tempDir}.worktrees`);662execSync('git init', { cwd: tempDir });663execSync('git config user.name "Agent Host Test"', { cwd: tempDir });664execSync('git config user.email "[email protected]"', { cwd: tempDir });665execSync('git commit --allow-empty -m "init"', { cwd: tempDir });666const defaultBranch = execSync('git branch --show-current', { cwd: tempDir, encoding: 'utf-8' }).trim();667const workingDirUri = URI.file(tempDir).toString();668669await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-worktree' });670await client.call('authenticate', { resource: 'https://api.github.com', token: 
resolveGitHubToken() });671672const sessionUri = URI.from({ scheme: 'copilotcli', path: `/real-test-wt-${Date.now()}` }).toString();673await client.call('createSession', {674session: sessionUri,675provider: 'copilotcli',676workingDirectory: workingDirUri,677config: { isolation: 'worktree', branch: defaultBranch },678});679680const addedNotif = await client.waitForNotification(n =>681n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',68215_000,683);684const addedSummary = ((addedNotif.params as INotificationBroadcastParams).notification as SessionAddedNotification).summary;685createdSessions.push(addedSummary.resource);686687// Subscribe so we receive action broadcasts for this session688await client.call<SubscribeResult>('subscribe', { resource: addedSummary.resource });689690// Verify the worktree path is in the summary691assert.ok(692addedSummary.workingDirectory,693'sessionAdded notification should have a workingDirectory',694);695assert.ok(696addedSummary.workingDirectory!.includes('.worktrees'),697`workingDirectory should be under the .worktrees folder, got: ${addedSummary.workingDirectory}`,698);699const resolvedWorkingDirectoryPath = URI.parse(addedSummary.workingDirectory!).fsPath;700701// Set the active client with tools (matching real VS Code flow where702// activeClientChanged is dispatched AFTER createSession). When the next703// sendMessage detects the tools changed vs the session's creation-time704// snapshot, it disposes the SDK session and re-creates it via705// _resumeSession. 
That resume path must use the worktree working706// directory, not the original repo path.707client.notify('dispatchAction', {708clientSeq: 1,709action: {710type: 'session/activeClientChanged',711session: addedSummary.resource,712activeClient: {713clientId: 'real-sdk-worktree',714displayName: 'Test Client',715tools: [716{717name: 'test_echo',718description: 'A harmless echo tool for testing',719inputSchema: { type: 'object', properties: { message: { type: 'string' } } },720},721],722},723},724});725726// Send a turn — this triggers sendMessage, which will detect the tools727// changed and refresh the session (dispose + _resumeSession). The728// resumed session should still have the worktree as its working729// directory. Ask a safe, read-only question about the working directory.730client.clearReceived();731dispatchTurn(client, addedSummary.resource, 'turn-wt',732'What is your current working directory? Reply with just the absolute path and nothing else.', 2);733734// Wait for the turn to complete or error735await client.waitForNotification(736n => isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error'),73790_000,738);739740// The session refresh should succeed — if it errors with741// "workingDirectory is required to resume", the worktree path was lost.742const errors = client.receivedNotifications(n => isActionNotification(n, 'session/error'));743assert.strictEqual(errors.length, 0,744errors.length > 0745? 
`Session error during turn (worktree path lost on resume): ${(getActionEnvelope(errors[0]).action as { error?: { message?: string } }).error?.message}`746: '',747);748749// Verify the turn got a response (the session resumed successfully)750const responseParts = client.receivedNotifications(n => isActionNotification(n, 'session/responsePart'));751assert.ok(responseParts.length > 0, 'should have received at least one response part after session refresh');752753client.clearReceived();754dispatchTurn(client, addedSummary.resource, 'turn-wt-terminal', 'Run the shell command: pwd', 3);755756const toolStartNotif = await client.waitForNotification(757n => isActionNotification(n, 'session/toolCallStart'),75860_000,759);760const toolStartAction = getActionEnvelope(toolStartNotif).action as { toolCallId: string };761762const toolReadyNotif = await client.waitForNotification(763n => isActionNotification(n, 'session/toolCallReady'),76430_000,765);766const toolReadyAction = getActionEnvelope(toolReadyNotif).action as { confirmed?: string };767if (!toolReadyAction.confirmed) {768client.notify('dispatchAction', {769clientSeq: 4,770action: {771type: 'session/toolCallConfirmed',772session: addedSummary.resource,773turnId: 'turn-wt-terminal',774toolCallId: toolStartAction.toolCallId,775approved: true,776},777});778}779780const terminalContentNotif = await client.waitForNotification(n => {781if (!isActionNotification(n, 'session/toolCallContentChanged')) {782return false;783}784const action = getActionEnvelope(n).action as { toolCallId: string; content: readonly ToolResultContent[] };785return action.toolCallId === toolStartAction.toolCallId && terminalResourceFromContent(action.content) !== undefined;786}, 30_000);787const terminalContentAction = getActionEnvelope(terminalContentNotif).action as { content: readonly ToolResultContent[] };788const terminalUri = terminalResourceFromContent(terminalContentAction.content);789assert.ok(terminalUri, 'shell tool should expose its terminal 
resource');

		// The subscribe snapshot of the terminal resource carries its cwd, which
		// must already be the resolved worktree directory.
		const terminalSubscribeResult = await client.call<SubscribeResult>('subscribe', { resource: terminalUri });
		const initialTerminalState = terminalSubscribeResult.snapshot.state as TerminalState;
		assert.strictEqual(initialTerminalState.cwd, resolvedWorkingDirectoryPath, 'terminal should be created in the resolved worktree directory');

		// Once the turn is done, re-read the terminal state and check the `pwd` output.
		await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);
		const terminalSnapshot = await client.call<SubscribeResult>('subscribe', { resource: terminalUri });
		const terminalState = terminalSnapshot.snapshot.state as TerminalState;
		assert.ok(terminalText(terminalState).includes(resolvedWorkingDirectoryPath), `pwd output should include the resolved worktree path ${resolvedWorkingDirectoryPath}`);
	});

	// ---- Subagent tool call grouping ----------------------------------------

	/**
	 * Asks the model to delegate a directory listing to a subagent through the
	 * `task` tool and verifies that the inner tool calls are reported on the
	 * subagent session rather than on the parent session.
	 */
	test('subagent tool calls are routed to the subagent session, not flat in the parent', async function () {
		this.timeout(180_000);

		// Set up a small fixture directory so the subagent has something to view.
		const tempDir = mkdtempSync(`${tmpdir()}/ahp-subagent-test-`);
		tempDirs.push(tempDir);
		writeFileSync(`${tempDir}/file-a.txt`, 'alpha');
		writeFileSync(`${tempDir}/file-b.txt`, 'beta');

		const sessionUri = await createRealSession(client, 'real-sdk-subagent', createdSessions, URI.file(tempDir).toString());

		// Auto-approve every tool that needs confirmation while the turn runs.
		// Multiple inner tool calls may need approval; doing this in a background
		// loop keeps the turn unblocked. Track processed serverSeqs so we don't
		// busy-spin on already-handled notifications (waitForNotification returns
		// matching notifications from the queue without consuming them). Using
		// serverSeq rather than toolCallId allows the same tool to be legitimately
		// re-confirmed in a later notification.
		let approvalsActive = true;
		let approvalSeq = 1000;
		const processedSeqs = new Set<number>();
		const approvalLoop = (async () => {
			while (approvalsActive) {
				try {
					const ready = await client.waitForNotification(n => {
						if (!isActionNotification(n, 'session/toolCallReady')) {
							return false;
						}
						const envelope = getActionEnvelope(n);
						const a = envelope.action as { confirmed?: string };
						return !a.confirmed && !processedSeqs.has(envelope.serverSeq);
					}, 2_000);
					const envelope = getActionEnvelope(ready);
					if (!processedSeqs.has(envelope.serverSeq)) {
						processedSeqs.add(envelope.serverSeq);
						const action = envelope.action as { session: string; turnId: string; toolCallId: string; confirmed?: string };
						if (!action.confirmed) {
							client.notify('dispatchAction', {
								clientSeq: ++approvalSeq,
								action: {
									type: 'session/toolCallConfirmed',
									session: action.session,
									turnId: action.turnId,
									toolCallId: action.toolCallId,
									approved: true,
								},
							});
						}
					}
				} catch {
					// Timeout — re-poll. Loop exits when approvalsActive flips.
				}
			}
		})();

		try {
			// Encourage the model to delegate via the `task` subagent tool. The exact
			// behaviour is non-deterministic — if the model declines we fail the test
			// with a clear message rather than silently passing.
			dispatchTurn(client, sessionUri, 'turn-sa',
				'Use the `task` tool to spawn a subagent to list the files in the current working directory. ' +
				'The subagent should call a single read-only tool (e.g. `view` or `bash` with `ls`) to enumerate the directory. ' +
				'Do not enumerate the directory yourself — delegate to the subagent.',
				1);

			// Wait for the parent's `task` tool call to expose a Subagent content
			// block carrying the subagent session URI.
			const subagentContentNotif = await client.waitForNotification(n => {
				if (!isActionNotification(n, 'session/toolCallContentChanged')) {
					return false;
				}
				const action = getActionEnvelope(n).action as { session: string; content: readonly ToolResultContent[] };
				return action.session === sessionUri && action.content.some(c => c.type === ToolResultContentType.Subagent);
			}, 120_000);

			const parentContent = (getActionEnvelope(subagentContentNotif).action as { content: readonly ToolResultContent[] }).content;
			// The wait predicate above only matches content that contains a
			// Subagent block, so this guard documents the invariant instead of
			// relying on a non-null assertion.
			const subagentRef = parentContent.find((c): c is ToolResultSubagentContent => c.type === ToolResultContentType.Subagent);
			assert.ok(subagentRef, 'parent tool call content should contain a Subagent content block');
			const subagentSessionUri = subagentRef.resource as unknown as string;
			assert.ok(typeof subagentSessionUri === 'string' && isSubagentSession(subagentSessionUri),
				`subagent session URI should be subagent-shaped, got: ${JSON.stringify(subagentSessionUri)}`);

			// Subscribe so we receive the subagent session's own action broadcasts.
			await client.call<SubscribeResult>('subscribe', { resource: subagentSessionUri });

			// Wait for the parent turn to complete (with a generous timeout — the
			// subagent's turn must finish first).
			await client.waitForNotification(n => {
				if (!isActionNotification(n, 'session/turnComplete')) {
					return false;
				}
				return (getActionEnvelope(n).action as { session: string }).session === sessionUri;
			}, 150_000);

			approvalsActive = false;
			await approvalLoop;

			// Group all received toolCallStart actions by the session they target.
			// This is the bug's signature: when inner tool_start arrives before
			// subagent_started, the inner tool calls leak into the parent session.
			const toolStarts = client.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'))
				.map(n => getActionEnvelope(n).action as SessionToolCallStartAction);

			const parentStarts = toolStarts.filter(a => (a.session as unknown as string) === sessionUri);
			const subagentStarts = toolStarts.filter(a => (a.session as unknown as string) === subagentSessionUri);

			// Parent should only carry the outer `task` tool call. Any other
			// tool call on the parent indicates the inner-tool routing bug.
			const parentNonTaskStarts = parentStarts.filter(a => a.toolName !== 'task');
			assert.deepStrictEqual(
				parentNonTaskStarts.map(a => a.toolName),
				[],
				`parent session should not contain inner tool calls; found: ${JSON.stringify(parentNonTaskStarts.map(a => a.toolName))}`,
			);

			// Subagent session must have at least one inner tool call. If this
			// fails, the subagent never actually executed any work — likely the
			// model didn't delegate as instructed.
			assert.ok(subagentStarts.length >= 1,
				`subagent session should contain at least one inner tool call, got ${subagentStarts.length}. ` +
				`Parent tool calls: ${JSON.stringify(parentStarts.map(a => a.toolName))}`);
		} finally {
			// Always stop the background approval loop, including when a wait
			// timed out or an assertion failed; otherwise it would keep polling
			// and auto-approving tool calls after this test has ended. On the
			// success path the loop has already exited and this is a no-op.
			approvalsActive = false;
			await approvalLoop;
		}
	});

	// ---- Model discovery -----------------------------------------------------

	/**
	 * Authenticates against the real SDK and checks that the `root/agentsChanged`
	 * action for the `copilotcli` agent carries a non-empty, well-shaped model list.
	 */
	test('listModels returns well-shaped model entries after authenticate', async function () {
		this.timeout(60_000);

		await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-list-models' }, 30_000);

		// Subscribe to root state *before* authenticating so we can observe
		// the agentsChanged action that carries the populated model list.
		const rootResult = await client.call<SubscribeResult>('subscribe', { resource: ROOT_STATE_URI }, 30_000);
		const initial = rootResult.snapshot.state as RootState;
		const copilotAgent = initial.agents.find(a => a.provider === 'copilotcli');
		assert.ok(copilotAgent, `Expected copilotcli agent in root state, got: ${initial.agents.map(a => a.provider).join(', ')}`);

		await client.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() }, 30_000);

		// Models are loaded asynchronously after authenticate. Wait for the
		// agentsChanged action that populates them.
		const notif = await client.waitForNotification(n => {
			if (!isActionNotification(n, 'root/agentsChanged')) {
				return false;
			}
			const action = getActionEnvelope(n).action as RootAgentsChangedAction;
			const agent = action.agents.find(a => a.provider === 'copilotcli');
			return !!agent && agent.models.length > 0;
		}, 30_000);

		const action = getActionEnvelope(notif).action as RootAgentsChangedAction;
		// Guaranteed by the wait predicate above; asserted explicitly instead of
		// using a non-null assertion.
		const agent = action.agents.find(a => a.provider === 'copilotcli');
		assert.ok(agent, 'Expected copilotcli agent in the agentsChanged action');

		assert.ok(agent.models.length > 0, 'Expected at least one model from listModels');

		// Assert every model has the shape CopilotAgent._listModels produces.
		// maxContextWindow is optional because synthetic SDK entries (e.g. the
		// `auto` router) ship with `capabilities: {}` and no fixed window.
		for (const model of agent.models) {
			assert.strictEqual(typeof model.id, 'string', `model.id should be a string: ${JSON.stringify(model)}`);
			assert.ok(model.id.length > 0, `model.id should be non-empty: ${JSON.stringify(model)}`);
			assert.strictEqual(typeof model.name, 'string', `model.name should be a string: ${JSON.stringify(model)}`);
			assert.strictEqual(model.provider, 'copilotcli', `model.provider should be copilotcli: ${JSON.stringify(model)}`);
			assert.ok(model.maxContextWindow === undefined || (typeof model.maxContextWindow === 'number' && model.maxContextWindow > 0),
				`model.maxContextWindow should be undefined or a positive number: ${JSON.stringify(model)}`);
			assert.ok(model.supportsVision === undefined || typeof model.supportsVision === 'boolean', `model.supportsVision should be boolean or undefined: ${JSON.stringify(model)}`);
		}

		// The `auto` synthetic router model should be present even though it
		// has no fixed context window.
		assert.ok(agent.models.some(m => m.id === 'auto'), `Expected 'auto' model in list, got: ${agent.models.map(m => m.id).join(', ')}`);
	});

	// ---- Redundant cd-prefix stripping --------------------------------------

	test('strips redundant `cd <workingDirectory> &&` prefix from shell tool calls', async function () {
		this.timeout(180_000);

		const tempDir = mkdtempSync(`${tmpdir()}/ahp-cd-strip-test-`);
		tempDirs.push(tempDir);
		const expectedWorkingDirPath = tempDir;
		const sessionUri = await createRealSession(client, 'real-sdk-cd-strip', createdSessions, URI.file(tempDir).toString());

		// Coax the model into producing a `cd <wd> && X` form.
// The exact text is non-deterministic, so the test asserts on rewrite
		// behavior conditional on actually receiving a cd-prefixed command.
		client.clearReceived();
		dispatchTurn(client, sessionUri, 'turn-cd-strip',
			`Run this exact shell command, do not modify it: cd ${expectedWorkingDirPath} && echo strip-me-please`,
			1);

		// Wait for the toolCallReady action that carries the rewritten toolInput.
		const toolReadyNotif = await client.waitForNotification(n => {
			if (!isActionNotification(n, 'session/toolCallReady')) {
				return false;
			}
			const action = getActionEnvelope(n).action as { toolInput?: string };
			return typeof action.toolInput === 'string' && action.toolInput.includes('echo strip-me-please');
		}, 90_000);

		const toolReadyEnvelope = getActionEnvelope(toolReadyNotif);
		const toolReadyAction = toolReadyEnvelope.action as { toolCallId: string; toolInput?: string; confirmed?: string };
		// The wait predicate above only matches string toolInput values; assert
		// it explicitly so the compiler narrows without a non-null assertion.
		const toolInput = toolReadyAction.toolInput;
		assert.ok(typeof toolInput === 'string', 'toolCallReady should carry a string toolInput');

		// The core assertion: regardless of whether the model emitted the cd
		// prefix verbatim or already pre-stripped it, the toolInput surfaced to
		// the client must NOT contain the redundant `cd <tempDir> &&` prefix.
		// Use a regex that anchors to the start of the command and tolerates
		// optional surrounding quotes around the directory plus either `&&`
		// or `;` as the chain operator (so quoted variants like
		// `cd "<wd>" && …` and pwsh-style `cd <wd>; …` are both detected).
		const escapedWorkingDirPath = expectedWorkingDirPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		const redundantWorkingDirCdPrefix = new RegExp(
			`^\\s*cd\\s+(?:"${escapedWorkingDirPath}"|'${escapedWorkingDirPath}'|${escapedWorkingDirPath})\\s*(?:&&|;)\\s*`,
		);
		assert.ok(
			!redundantWorkingDirCdPrefix.test(toolInput),
			`toolInput should not contain a redundant cd-prefix targeting the working directory; got: ${JSON.stringify(toolInput)}`,
		);
		assert.ok(
			toolInput.includes('echo strip-me-please'),
			`toolInput should contain the rewritten command body; got: ${JSON.stringify(toolInput)}`,
		);

		// clientSeq 1 was used by dispatchTurn; every confirmation sent below
		// takes the next value so no sequence number is reused.
		let nextClientSeq = 2;

		// Approve so the turn can complete. If it was already auto-confirmed
		// (`confirmed` is set), skip the manual approval.
		if (!toolReadyAction.confirmed) {
			client.notify('dispatchAction', {
				clientSeq: nextClientSeq++,
				action: {
					type: 'session/toolCallConfirmed',
					session: sessionUri,
					turnId: 'turn-cd-strip',
					toolCallId: toolReadyAction.toolCallId,
					approved: true,
				},
			});
		}

		// Drive any further confirmations to completion so teardown is clean.
		// waitForNotification returns matching notifications from the queue
		// without consuming them, so toolCallReady notifications that were
		// already handled (starting with the one approved above) are tracked by
		// serverSeq and excluded from the predicate. Without this the loop
		// would keep receiving the same notification and never observe
		// turnComplete.
		const handledReadySeqs = new Set<number>([toolReadyEnvelope.serverSeq]);
		while (true) {
			const next = await client.waitForNotification(n => {
				if (isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error')) {
					return true;
				}
				return isActionNotification(n, 'session/toolCallReady') && !handledReadySeqs.has(getActionEnvelope(n).serverSeq);
			}, 90_000);
			if (isActionNotification(next, 'session/turnComplete') || isActionNotification(next, 'session/error')) {
				break;
			}
			const envelope = getActionEnvelope(next);
			handledReadySeqs.add(envelope.serverSeq);
			const action = envelope.action as { session: string; turnId: string; toolCallId: string; confirmed?: string };
			if (!action.confirmed) {
				client.notify('dispatchAction', {
					clientSeq: nextClientSeq++,
					action: {
						type: 'session/toolCallConfirmed',
						session: action.session,
						turnId: action.turnId,
						toolCallId: action.toolCallId,
						approved: true,
					},
				});
			}
		}
	});

	// ---- write_bash skipPermission regression test --------------------------

	test('write_bash never triggers a permission request (skipPermission flag)', async function () {
		this.timeout(180_000);

		// What this test verifies:
		//   `write_bash` (and `read_bash` / `bash_shutdown` / `list_bash`) are
		//   registered as external tools with `skipPermission: true`, mirroring
		//   the SDK's built-in shell helpers which never call `permissions.request`.
		//   This regression test catches accidental removal of that flag — if it's
		//   removed, the SDK will route write_bash through our permission flow and
		//   the test will fail with `observedToolNames` containing 'write_bash'.
		//
		// How it works:
		//   1. Allow-list permits ONLY `bash` (the interactive prompt). write_bash
		//      is intentionally absent from the allow list.
		//   2. The model is instructed to use `write_bash`. If any permission
		//      request appears for write_bash, the loop records it in
		//      `observedToolNames` and we fail the assertion.
		//   3. We assert that bash actually ran AND that write_bash appeared in
		//      toolCallStart notifications (so the test is non-vacuous — the model
		//      actually tried to use the tool, not just piped input via bash).

		const tempDir = mkdtempSync(`${tmpdir()}/ahp-write-bash-skip-perm-`);
		tempDirs.push(tempDir);
		const sessionUri = await createRealSession(client, 'real-sdk-write-bash-skip-perm', createdSessions, URI.file(tempDir).toString());

		const approvalLoop = startBackgroundApprovalLoop(client, {
			approvalSeqStart: 100,
			allow: [
				{
					// Setup bash command — the interactive `read` prompt.
					toolName: 'bash',
					matchInput: input => !!input && input.includes('read') && input.includes('Got:'),
				},
				// Note: write_bash is intentionally NOT in the allow list. With
				// skipPermission: true, the SDK won't ask us — so the test passes.
				// Without it, the SDK would ask, the loop would deny + record an
				// error, and the test would fail loudly.
			],
		});

		try {
			dispatchTurn(client, sessionUri, 'turn-write-bash-skip-perm',
				'You MUST demonstrate the `write_bash` tool. Steps, in order:\n' +
				'1. Use the `bash` tool to run exactly: read -p "Enter: " v; echo "Got: $v"\n' +
				' This will block waiting for stdin.\n' +
				'2. While that bash call is waiting, you MUST use the `write_bash` tool to send the input "hello\\n" to it.\n' +
				' Do NOT pipe the input via the original bash command. Do NOT use `echo hello | ...`.\n' +
				' You MUST go through the `write_bash` tool — that is the entire point of this task.\n' +
				'3. After the shell prints "Got: hello", reply with the single word "done".',
				1);

			await client.waitForNotification(
				n => isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error'),
				150_000,
			);
		} finally {
			// Stop the background approval loop even when the wait above
			// rejects (e.g. on timeout), so it does not outlive this test.
			await approvalLoop.stop();
		}

		// Sanity check: the bash setup command actually ran. Otherwise the
		// model ignored the prompt and the write_bash assertion below is vacuous.
		assert.ok(approvalLoop.approvedToolNames.has('bash'),
			`expected the model to invoke bash for setup; observed approved tools: ${[...approvalLoop.approvedToolNames].join(', ') || '<none>'}`);

		// Non-vacuousness check: write_bash must have actually been invoked
		// (seen in a toolCallStart notification). If the model piped input via
		// the original bash command instead of using write_bash, this fails.
		const writeBashStarts = client.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'))
			.map(n => getActionEnvelope(n).action as { toolName?: string })
			.filter(a => a.toolName === 'write_bash');
		assert.ok(writeBashStarts.length > 0,
			`expected write_bash to be invoked at least once (toolCallStart), but it was never called. The model may have piped input via the original bash command instead.`);

		// The actual regression check: write_bash must never reach our
		// permission handler. If this fails, `skipPermission: true` was likely
		// removed from copilotShellTools.ts.
		assert.ok(!approvalLoop.observedToolNames.has('write_bash'),
			`write_bash should be auto-approved by the SDK (skipPermission: true) and never trigger a permission request, but the test observed one. Observed permission requests: ${[...approvalLoop.observedToolNames].join(', ')}`);

		// Any other unexpected permission requests (e.g. an unrelated tool the
		// model decided to use) would also have been recorded as errors.
		assert.deepStrictEqual(approvalLoop.errors, [],
			`unexpected approval-loop errors: ${approvalLoop.errors.join('; ')}`);
	});
});