CoCalc -- toolApprovalRealSdk.integrationTest.ts

GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/platform/agentHost/test/node/protocol/toolApprovalRealSdk.integrationTest.ts
¹³⁴⁰⁵ views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
/**
7
 * Integration tests using the real Copilot SDK instead of a mock agent.
8
 *
9
 * These tests are **disabled by default**. To run them, set `AGENT_HOST_REAL_SDK=1`:
10
 *
11
 *   AGENT_HOST_REAL_SDK=1 ./scripts/test-integration.sh --run src/vs/platform/agentHost/test/node/protocol/toolApprovalRealSdk.integrationTest.ts
12
 *
13
 * Authentication: By default the token is obtained from `gh auth token`.
14
 * You can override it by setting `GITHUB_TOKEN=ghp_xxx`.
15
 *
16
 * SAFETY: These tests create real agent sessions backed by the Copilot SDK.
17
 * The agent may execute tool calls on the user's machine. Prompts should be
18
 * carefully chosen to avoid destructive side-effects — prefer read-only
19
 * questions, safe commands like `echo`, and use isolated temp directories as
20
 * working directories. Never ask the agent to delete, modify, or install
21
 * anything outside of a test-owned temp directory.
22
 */
23

24
import assert from 'assert';
25
import { execSync } from 'child_process';
26
import { mkdtempSync, rmSync, writeFileSync } from 'fs';
27
import { tmpdir } from 'os';
28
import { removeAnsiEscapeCodes } from '../../../../../base/common/strings.js';
29
import { URI } from '../../../../../base/common/uri.js';
30
import type { SessionToolCallStartAction } from '../../../common/state/protocol/actions.js';
31
import { SubscribeResult } from '../../../common/state/protocol/commands.js';
32
import { PROTOCOL_VERSION } from '../../../common/state/sessionCapabilities.js';
33
import { ResponsePartKind, ROOT_STATE_URI, SessionInputAnswerState, SessionInputAnswerValueKind, SessionInputQuestionKind, SessionInputResponseKind, ToolResultContentType, isSubagentSession, type SessionInputAnswer, type SessionInputRequest, type SessionState, type TerminalState, type ToolResultContent, type ToolResultSubagentContent } from '../../../common/state/sessionState.js';
34
import type { RootState } from '../../../common/state/protocol/state.js';
35
import type { RootAgentsChangedAction, SessionAddedNotification, SessionInputRequestedAction, SessionToolCallReadyAction } from '../../../common/state/sessionActions.js';
36
import type { INotificationBroadcastParams } from '../../../common/state/sessionProtocol.js';
37
import {
38
	getActionEnvelope,
39
	isActionNotification,
40
	IServerHandle,
41
	startRealServer,
42
	TestProtocolClient,
43
} from './testHelpers.js';
44

45
/** True only when the `AGENT_HOST_REAL_SDK` environment variable is exactly `'1'`; gates whether the real-SDK suite runs or is skipped. */
const REAL_SDK_ENABLED = process.env['AGENT_HOST_REAL_SDK'] === '1';
46

47
/**
 * Resolves the GitHub token used to authenticate against the agent host.
 *
 * Resolution order:
 *  1. The `GITHUB_TOKEN` environment variable, when it contains something
 *     other than whitespace.
 *  2. The output of `gh auth token`.
 *
 * Surrounding whitespace is stripped from both sources, and an empty result
 * is never returned: handing an empty token to `authenticate` would only
 * fail later with a far less actionable error.
 *
 * @returns A non-empty token string.
 * @throws Error when `GITHUB_TOKEN` is unset/blank and `gh auth token` either
 *   fails or prints nothing.
 */
function resolveGitHubToken(): string {
	const envToken = process.env['GITHUB_TOKEN']?.trim();
	if (envToken) {
		return envToken;
	}

	let cliToken: string;
	try {
		cliToken = execSync('gh auth token', { encoding: 'utf-8' }).trim();
	} catch {
		throw new Error('No GITHUB_TOKEN set and `gh auth token` failed. Run `gh auth login` first.');
	}

	// `gh` can exit successfully without printing a token; treat that as a failure too.
	if (!cliToken) {
		throw new Error('No GITHUB_TOKEN set and `gh auth token` returned an empty token. Run `gh auth login` first.');
	}
	return cliToken;
}
59

60
/**
 * Convenience wrapper around `createRealSessionFull` for callers that only
 * need the URI of the newly created (and already subscribed) session.
 *
 * @param c Connected protocol client.
 * @param clientId Client id sent with `initialize`.
 * @param trackingList Receives the created session's URI (for later disposal).
 * @param workingDirectory Optional working-directory URI string for the session.
 * @returns The session URI reported by the server.
 */
async function createRealSession(c: TestProtocolClient, clientId: string, trackingList: string[], workingDirectory?: string): Promise<string> {
	const { sessionUri } = await createRealSessionFull(c, clientId, trackingList, workingDirectory);
	return sessionUri;
}
65

66
/** Everything `createRealSessionFull` learned while creating a session. */
interface IRealSessionResult {
	/** Session URI taken from the `notify/sessionAdded` summary (`summary.resource`). */
	sessionUri: string;
	/** The `notify/sessionAdded` notification observed after `createSession`. */
	addedNotification: SessionAddedNotification;
	/** Session state returned in the snapshot of the initial `subscribe` call. */
	subscribeSnapshot: SessionState;
}
71

72
/**
 * Creates a session against the real `copilotcli` provider and subscribes to it.
 *
 * Steps, in order: `initialize`, `authenticate` (token from
 * `resolveGitHubToken`), `createSession`, wait for `notify/sessionAdded`,
 * `subscribe`, then clear the client's received-notification buffer so callers
 * start from a clean slate.
 *
 * The URI used from here on is the one carried by the `sessionAdded` summary,
 * not the one that was requested; it is also what gets pushed onto
 * `trackingList`.
 *
 * @param c Connected protocol client.
 * @param clientId Client id sent with `initialize`.
 * @param trackingList Receives the created session's URI (for later disposal).
 * @param workingDirectory Optional working-directory URI string for the session.
 * @returns The session URI, the `sessionAdded` notification and the subscribe snapshot.
 */
async function createRealSessionFull(c: TestProtocolClient, clientId: string, trackingList: string[], workingDirectory?: string): Promise<IRealSessionResult> {
	const handshakeTimeoutMs = 30_000;

	await c.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId }, handshakeTimeoutMs);
	await c.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() }, handshakeTimeoutMs);

	const requestedUri = URI.from({ scheme: 'copilotcli', path: `/real-test-${Date.now()}` }).toString();
	await c.call('createSession', { session: requestedUri, provider: 'copilotcli', workingDirectory }, handshakeTimeoutMs);

	const broadcast = await c.waitForNotification(n => {
		if (n.method !== 'notification') {
			return false;
		}
		return (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded';
	}, 15_000);
	const addedNotification = (broadcast.params as INotificationBroadcastParams).notification as SessionAddedNotification;

	const sessionUri = addedNotification.summary.resource;
	trackingList.push(sessionUri);

	const { snapshot } = await c.call<SubscribeResult>('subscribe', { resource: sessionUri });
	const subscribeSnapshot = snapshot.state as SessionState;
	c.clearReceived();

	return { sessionUri, addedNotification, subscribeSnapshot };
}
95

96
/**
 * Starts a new turn by sending a `session/turnStarted` action through
 * `dispatchAction`. Fire-and-forget: nothing is awaited.
 *
 * @param c Connected protocol client.
 * @param session URI of the session to run the turn in.
 * @param turnId Caller-chosen id for the turn.
 * @param text User message text.
 * @param clientSeq Client sequence number attached to the dispatch.
 */
function dispatchTurn(c: TestProtocolClient, session: string, turnId: string, text: string, clientSeq: number): void {
	const action = {
		type: 'session/turnStarted',
		session,
		turnId,
		userMessage: { text },
	};
	c.notify('dispatchAction', { clientSeq, action });
}
108

109
/**
 * Builds a "submitted" answer for every question of an input request so a
 * test can accept the request without user interaction.
 *
 * Answer selection per question kind:
 *  - Text: `defaultValue`, else the literal `'interactive'`.
 *  - Number / Integer: `defaultValue`, else `min`, else `1`.
 *  - Boolean: `defaultValue`, else `true`.
 *  - SingleSelect: the first option whose id or label matches /interactive/i,
 *    else the first recommended option, else the first option.
 *  - MultiSelect: all recommended options, else just the first option.
 *
 * Questions that cannot be answered — a single-select without any options, or
 * a kind this helper does not handle — are left out of the result instead of
 * crashing on an `undefined` option or an `undefined` map entry.
 *
 * @param request The input request received from the server.
 * @returns Answers keyed by question id, or `undefined` when the request has no questions.
 */
function getAcceptedAnswers(request: SessionInputRequest): Record<string, SessionInputAnswer> | undefined {
	if (!request.questions?.length) {
		return undefined;
	}

	const entries: Array<[string, SessionInputAnswer]> = [];
	for (const question of request.questions) {
		switch (question.kind) {
			case SessionInputQuestionKind.Text:
				entries.push([question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Text,
						value: question.defaultValue ?? 'interactive',
					},
				}]);
				break;
			case SessionInputQuestionKind.Number:
			case SessionInputQuestionKind.Integer:
				entries.push([question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Number,
						value: question.defaultValue ?? question.min ?? 1,
					},
				}]);
				break;
			case SessionInputQuestionKind.Boolean:
				entries.push([question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Boolean,
						value: question.defaultValue ?? true,
					},
				}]);
				break;
			case SessionInputQuestionKind.SingleSelect: {
				const preferredOption = question.options.find(option => /interactive/i.test(option.id) || /interactive/i.test(option.label))
					?? question.options.find(option => option.recommended)
					?? question.options[0];
				if (!preferredOption) {
					// Nothing to choose from: leave the question unanswered.
					break;
				}
				entries.push([question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.Selected,
						value: preferredOption.id,
					},
				}]);
				break;
			}
			case SessionInputQuestionKind.MultiSelect: {
				const preferredOptions = question.options.filter(option => option.recommended);
				const selectedOptions = preferredOptions.length > 0 ? preferredOptions : question.options.slice(0, 1);
				entries.push([question.id, {
					state: SessionInputAnswerState.Submitted,
					value: {
						kind: SessionInputAnswerValueKind.SelectedMany,
						value: selectedOptions.map(option => option.id),
					},
				}]);
				break;
			}
		}
	}

	return Object.fromEntries(entries);
}
167

168
/**
 * Reassembles the markdown text of the response from the notifications the
 * client has received so far.
 *
 * A markdown part is opened by a `session/responsePart` action carrying the
 * first chunk; later chunks arrive as `session/delta` actions addressed to
 * the same part id. Deltas for parts that were not opened as markdown are
 * ignored.
 *
 * @param c Protocol client whose received notifications are scanned.
 * @returns All markdown chunks concatenated in arrival order.
 */
function getMarkdownResponseText(c: TestProtocolClient): string {
	const relevant = c.receivedNotifications(n =>
		isActionNotification(n, 'session/responsePart') || isActionNotification(n, 'session/delta')
	);

	const openMarkdownParts = new Set<string>();
	const chunks: string[] = [];
	for (const entry of relevant) {
		const action = getActionEnvelope(entry).action;
		if (action.type === 'session/responsePart') {
			if (action.part.kind === ResponsePartKind.Markdown) {
				openMarkdownParts.add(action.part.id);
				chunks.push(action.part.content);
			}
			continue;
		}
		if (action.type === 'session/delta' && openMarkdownParts.has(action.partId)) {
			chunks.push(action.content);
		}
	}
	return chunks.join('');
}
187

188
/** Summary of what `driveTurnToCompletion` observed while running a turn. */
interface IDrivenTurnResult {
	/** True when at least one `session/inputRequested` action was seen (and answered). */
	sawInputRequest: boolean;
	/** True when at least one `session/toolCallReady` arrived without `confirmed` set, i.e. needed approval. */
	sawPendingConfirmation: boolean;
	/** Markdown response text collected via `getMarkdownResponseText` once the turn ended. */
	responseText: string;
}
193

194
/**
 * Dispatches a turn and services it until `session/turnComplete` arrives.
 *
 * While the turn runs this helper:
 *  - approves every `session/toolCallReady` that is not already confirmed,
 *  - accepts every `session/inputRequested` with answers from `getAcceptedAnswers`,
 *  - throws on `session/error`, including the server-provided message when present.
 *
 * `waitForNotification` is re-queried with a predicate that excludes
 * notifications already handled (tracked by object identity), so each
 * notification is processed once. Follow-up dispatches use
 * `clientSeq + 1, clientSeq + 2, …`.
 *
 * Note: the client's received-notification buffer is cleared before the turn
 * is dispatched.
 *
 * @param c Connected, subscribed protocol client.
 * @param session Session URI.
 * @param turnId Caller-chosen turn id.
 * @param text User message text.
 * @param clientSeq Sequence number for the `turnStarted` dispatch.
 * @returns Flags describing what was observed plus the markdown response text.
 * @throws Error when the session reports an error, or when a wait times out.
 */
async function driveTurnToCompletion(c: TestProtocolClient, session: string, turnId: string, text: string, clientSeq: number): Promise<IDrivenTurnResult> {
	c.clearReceived();
	dispatchTurn(c, session, turnId, text, clientSeq);

	const seenNotifications = new Set<object>();
	let nextClientSeq = clientSeq + 1;
	let sawInputRequest = false;
	let sawPendingConfirmation = false;

	while (true) {
		const notification = await c.waitForNotification(n => !seenNotifications.has(n as object) && (
			isActionNotification(n, 'session/toolCallReady')
			|| isActionNotification(n, 'session/inputRequested')
			|| isActionNotification(n, 'session/turnComplete')
			|| isActionNotification(n, 'session/error')
		), 90_000);
		seenNotifications.add(notification as object);

		if (isActionNotification(notification, 'session/error')) {
			// Surface the underlying message so a failing run is diagnosable from the test output.
			const detail = (getActionEnvelope(notification).action as { error?: { message?: string } }).error?.message;
			throw new Error(detail
				? `Session error while driving ${turnId}: ${detail}`
				: `Session error while driving ${turnId}`);
		}

		if (isActionNotification(notification, 'session/toolCallReady')) {
			const action = getActionEnvelope(notification).action as SessionToolCallReadyAction;
			if (!action.confirmed) {
				sawPendingConfirmation = true;
				c.notify('dispatchAction', {
					clientSeq: nextClientSeq++,
					action: {
						type: 'session/toolCallConfirmed',
						session,
						turnId,
						toolCallId: action.toolCallId,
						approved: true,
					},
				});
			}
			continue;
		}

		if (isActionNotification(notification, 'session/inputRequested')) {
			sawInputRequest = true;
			const action = getActionEnvelope(notification).action as SessionInputRequestedAction;
			c.notify('dispatchAction', {
				clientSeq: nextClientSeq++,
				action: {
					type: 'session/inputCompleted',
					session,
					requestId: action.request.id,
					response: SessionInputResponseKind.Accept,
					answers: getAcceptedAnswers(action.request),
				},
			});
			continue;
		}

		// Only `session/turnComplete` is left: the turn is done.
		break;
	}

	return {
		sawInputRequest,
		sawPendingConfirmation,
		responseText: getMarkdownResponseText(c),
	};
}
259

260
/**
 * Returns the `resource` of the first terminal-typed entry in a tool result.
 *
 * @param content Tool result content entries.
 * @returns The terminal resource, or `undefined` when no terminal entry exists.
 */
function terminalResourceFromContent(content: readonly ToolResultContent[]): string | undefined {
	for (const entry of content) {
		if (entry.type === ToolResultContentType.Terminal) {
			return entry.resource;
		}
	}
	return undefined;
}
264

265
/**
 * Flattens a terminal state into plain text with ANSI escape codes removed.
 *
 * `command` parts contribute their command line, a newline and their output;
 * every other part contributes its `value`. Parts are concatenated without a
 * separator.
 *
 * @param state Terminal state to render.
 * @returns The concatenated, escape-free text.
 */
function terminalText(state: TerminalState): string {
	const rendered = state.content.map(part => {
		if (part.type === 'command') {
			return `${part.commandLine}\n${part.output}`;
		}
		return part.value;
	});
	return removeAnsiEscapeCodes(rendered.join(''));
}
268

269
/**
 * Resolves the tool name for a tool call id. `session/toolCallReady` is joined
 * against `session/toolCallStart` here, so the received `toolCallStart`
 * notifications are scanned for the matching id.
 *
 * @param c Protocol client whose received notifications are scanned.
 * @param toolCallId Id of the tool call to look up.
 * @returns The tool name of the first matching start action, or `undefined` if none was received.
 */
function findToolNameForCall(c: TestProtocolClient, toolCallId: string): string | undefined {
	const starts = c.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'));
	for (const entry of starts) {
		const started = getActionEnvelope(entry).action as SessionToolCallStartAction;
		if (started.toolCallId === toolCallId) {
			return started.toolName;
		}
	}
	return undefined;
}
275

276
/** One allow-list entry describing a tool call the background approval loop may approve. */
interface IApprovalRule {
	/** Name of the tool the rule covers, e.g. `'bash'` or `'write_bash'`. */
	toolName: string;
	/** Extra filter on the tool input; when absent, every input for `toolName` matches. */
	matchInput?: (toolInput: string | undefined) => boolean;
	/**
	 * Hook invoked for each matched call right before it is approved.
	 * Append a message to `errors` to make the test fail.
	 */
	inspect?: (info: {
		action: SessionToolCallReadyAction;
		errors: string[];
	}) => void;
}
290

291
/** Configuration for `startBackgroundApprovalLoop`. */
interface IBackgroundApprovalLoopOptions {
	/**
	 * Base for the clientSeq values of the `toolCallConfirmed` actions the
	 * loop dispatches. The counter is incremented before each use, so the
	 * first dispatch carries `approvalSeqStart + 1`. Pick a value that keeps
	 * these clear of the test's own dispatches.
	 */
	approvalSeqStart: number;
	/**
	 * Allow-list of tool calls the loop may auto-approve. A pending
	 * confirmation is approved by the first rule whose `toolName` matches and
	 * whose optional `matchInput` predicate accepts the input. Calls with no
	 * matching rule are recorded as errors and denied — the loop refuses to
	 * rubber-stamp anything the test didn't anticipate (e.g. an unexpected
	 * `rm` from the model).
	 */
	allow: readonly IApprovalRule[];
}
303

304
/** Handle returned by `startBackgroundApprovalLoop`. */
interface IBackgroundApprovalLoop {
	/** Problems recorded while running: tool calls outside the allow-list, inspector failures and loop errors. */
	readonly errors: readonly string[];
	/** Names of tools for which at least one call was approved. */
	readonly approvedToolNames: ReadonlySet<string>;
	/**
	 * Names of tools for which the loop saw a permission request, whether or
	 * not the request matched the allow-list. Handy for asserting that a tool
	 * with `skipPermission: true` never went through the permission flow.
	 */
	readonly observedToolNames: ReadonlySet<string>;
	/** Asks the loop to stop and resolves once it has finished. */
	stop(): Promise<void>;
}
318

319
/**
 * Launches a background loop that answers pending tool-call confirmations
 * during a real-SDK turn. A call is approved only when it matches the
 * supplied allow-list; everything else is denied and logged in `errors`, so a
 * test fails loudly rather than silently approving model-chosen tool calls.
 *
 * The loop polls `waitForNotification` with a 2-second timeout. Because that
 * call does NOT remove notifications from the client's queue, already handled
 * `toolCallReady` actions are filtered out by their `serverSeq`.
 *
 * Dispatched confirmations use clientSeq values starting at
 * `options.approvalSeqStart + 1` (the counter is pre-incremented).
 *
 * @param c Connected, subscribed protocol client.
 * @param options Sequence base and allow-list.
 * @returns A handle exposing collected errors, observed/approved tool names and `stop()`.
 */
function startBackgroundApprovalLoop(c: TestProtocolClient, options: IBackgroundApprovalLoopOptions): IBackgroundApprovalLoop {
	type PendingReadyAction = SessionToolCallReadyAction & { session: string; turnId: string };

	const errors: string[] = [];
	const approvedToolNames = new Set<string>();
	const observedToolNames = new Set<string>();
	const handledServerSeqs = new Set<number>();
	let running = true;
	let lastUsedSeq = options.approvalSeqStart;

	/** Sends the approve/deny decision for a pending tool call. */
	const respond = (action: PendingReadyAction, approved: boolean): void => {
		c.notify('dispatchAction', {
			clientSeq: ++lastUsedSeq,
			action: {
				type: 'session/toolCallConfirmed',
				session: action.session,
				turnId: action.turnId,
				toolCallId: action.toolCallId,
				approved,
			},
		});
	};

	const pump = async (): Promise<void> => {
		while (running) {
			try {
				const ready = await c.waitForNotification(
					n => isActionNotification(n, 'session/toolCallReady') && !handledServerSeqs.has(getActionEnvelope(n).serverSeq),
					2_000,
				);
				const envelope = getActionEnvelope(ready);
				handledServerSeqs.add(envelope.serverSeq);

				const action = envelope.action as PendingReadyAction;
				if (action.confirmed) {
					// Already confirmed: no decision to send.
					continue;
				}

				const toolName = findToolNameForCall(c, action.toolCallId);
				if (toolName) {
					observedToolNames.add(toolName);
				}

				const rule = options.allow.find(candidate =>
					candidate.toolName === toolName
					&& (candidate.matchInput?.(action.toolInput) ?? true));

				if (rule) {
					// The inspector runs first; if it throws, no approval is sent.
					rule.inspect?.({ action, errors });
					approvedToolNames.add(rule.toolName);
					respond(action, true);
				} else {
					errors.push(`unexpected tool call: toolName=${toolName ?? '<unknown>'} input=${JSON.stringify(action.toolInput)}`);
					respond(action, false);
				}
			} catch (e) {
				// The 2-second poll timeout is expected and simply re-enters the
				// loop. Anything else (e.g. 'Client closed', an exception thrown
				// by a rule's inspector) is a real failure: record it and stop.
				const msg = e instanceof Error ? e.message : String(e);
				const isPollTimeout = msg.includes('Timed out') || msg.includes('timed out');
				if (!isPollTimeout) {
					errors.push(`approval loop error: ${msg}`);
					running = false;
				}
			}
		}
	};
	const loop = pump();

	return {
		errors,
		approvedToolNames,
		observedToolNames,
		stop: async (): Promise<void> => {
			running = false;
			await loop;
		},
	};
}
411

412
(REAL_SDK_ENABLED ? suite : suite.skip)('Protocol WebSocket — Real Copilot SDK', function () {
413

414
	/** Handle of the server started once for the whole suite (assigned in `suiteSetup`). */
	let server: IServerHandle;
	/** Protocol client for the current test; a new one is connected in `setup`. */
	let client: TestProtocolClient;
	/** URIs of sessions created by the current test; disposed in teardown. */
	const createdSessions: string[] = [];
	/** Temp directories created by the current test; removed in teardown. */
	const tempDirs: string[] = [];

	// Start the real server once; starting it gets a generous timeout.
	suiteSetup(async function () {
		this.timeout(60_000);
		server = await startRealServer();
	});

	// `server` may be unset if `suiteSetup` failed, hence the optional chain.
	suiteTeardown(function () {
		server?.process.kill();
	});

	// Every test talks to the shared server through its own connection.
	setup(async function () {
		this.timeout(30_000);
		client = new TestProtocolClient(server.port);
		await client.connect();
	});
435

436
	// Per-test cleanup: dispose sessions, close the client, remove temp dirs.
	// Everything here is best-effort so cleanup problems never mask the
	// actual test result.
	teardown(async function () {
		// Each dispose may wait up to 5s and directory removal retries, so give
		// the hook an explicit budget like `setup` has.
		this.timeout(30_000);

		// Dispose all sessions created during this test
		for (const session of createdSessions) {
			try {
				await client.call('disposeSession', { session }, 5000);
			} catch {
				// Best-effort cleanup — the session may already be gone
			}
		}
		createdSessions.length = 0;
		// `client` is unset when `setup` failed before assigning it; don't let
		// that turn into a second, misleading failure in teardown.
		client?.close();

		// Remove temp directories created during this test. On Windows the
		// agent subprocess can still hold handles to the working directory for
		// a brief moment after `disposeSession` returns, which surfaces as
		// EBUSY. Retry a few times to give the OS a chance to release the
		// handle before giving up.
		for (const dir of tempDirs) {
			try {
				rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 200 });
			} catch {
				// Best-effort cleanup — leftover temp dirs in os.tmpdir() are
				// harmless and shouldn't fail an otherwise passing test.
			}
		}
		tempDirs.length = 0;
	});
463

464
	// ---- Basic turn execution ------------------------------------------------
465

466
	// Smoke test: a trivial prompt must complete and yield at least one response part.
	test('sends a simple message and receives a response', async function () {
		this.timeout(120_000);

		const workingDirectory = URI.file(tmpdir()).toString();
		const session = await createRealSession(client, 'real-sdk-simple', createdSessions, workingDirectory);
		dispatchTurn(client, session, 'turn-1', 'Say exactly "hello" and nothing else', 1);

		// The real SDK can be slow; allow plenty of time for the turn to finish.
		await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);

		// At least one response part must have been streamed.
		const partCount = client.receivedNotifications(n => isActionNotification(n, 'session/responsePart')).length;
		assert.ok(partCount > 0, 'should have received at least one response part');
	});
479

480
	// ---- Tool call with permission flow -------------------------------------
481

482
	// Runs a harmless `echo` in an isolated temp directory and approves the
	// resulting permission request if one is pending.
	test('tool call triggers permission request and can be approved', async function () {
		this.timeout(120_000);

		const tempDir = mkdtempSync(`${tmpdir()}/ahp-perm-test-`);
		tempDirs.push(tempDir);
		const sessionUri = await createRealSession(client, 'real-sdk-permission', createdSessions, URI.file(tempDir).toString());
		dispatchTurn(client, sessionUri, 'turn-perm', 'Run the shell command: echo "hello from test"', 1);

		// The real SDK should fire a tool call that needs permission
		await client.waitForNotification(
			n => isActionNotification(n, 'session/toolCallStart'),
			60_000,
		);

		// Wait for toolCallReady (pending confirmation)
		const toolReadyNotif = await client.waitForNotification(
			n => isActionNotification(n, 'session/toolCallReady'),
			30_000,
		);
		const toolReadyAction = getActionEnvelope(toolReadyNotif).action as { toolCallId: string; confirmed?: string };

		// If the tool was auto-approved, confirmed will be set; if pending, confirm it.
		// The confirmation must name the call that is actually pending, i.e. the
		// id carried by the ready action — the first toolCallStart seen is not
		// guaranteed to belong to the same call.
		if (!toolReadyAction.confirmed) {
			client.notify('dispatchAction', {
				clientSeq: 2,
				action: {
					type: 'session/toolCallConfirmed',
					session: sessionUri,
					turnId: 'turn-perm',
					toolCallId: toolReadyAction.toolCallId,
					approved: true,
				},
			});
		}

		// Wait for the turn to complete
		await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);
	});
521

522
	// Plan-mode round trip: write a plan without permission prompts, accept the
	// exit-plan question, switch back to interactive mode and verify that a
	// follow-up turn still runs on the same session with the plan in context.
	test('planning-mode session-state writes are auto-approved in default mode', async function () {
		this.timeout(180_000);

		const planDir = mkdtempSync(`${tmpdir()}/ahp-plan-test-`);
		tempDirs.push(planDir);
		const session = await createRealSession(client, 'real-sdk-plan-mode', createdSessions, URI.file(planDir).toString());

		/** Dispatches a `session/configChanged` for the mode and waits for a `session/configChanged` notification. */
		const switchMode = async (clientSeq: number, mode: 'plan' | 'interactive'): Promise<void> => {
			client.notify('dispatchAction', {
				clientSeq,
				action: {
					type: 'session/configChanged',
					session,
					config: { mode },
				},
			});
			await client.waitForNotification(n => isActionNotification(n, 'session/configChanged'));
		};

		/** Number of `notify/sessionAdded` broadcasts currently in the client's received buffer. */
		const sessionAddedCount = (): number => client.receivedNotifications(n =>
			n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',
		).length;

		// Enter plan mode through the regular config-change flow before the
		// first turn. The agent host reads this value at turn-start time and
		// pushes it to the SDK via `rpc.mode.set`.
		await switchMode(1, 'plan');

		const planOutcome = await driveTurnToCompletion(client, session, 'turn-plan',
			'Help me implement a Python script that prints "hello world" to stdout. Write the shortest possible plan to your session plan.md and use the exit_plan_mode tool to ask me to approve it before writing any code.', 2);
		assert.strictEqual(planOutcome.sawPendingConfirmation, false, 'should not have received pending-confirmation toolCallReady while writing session-state plan.md');
		assert.ok(planOutcome.sawInputRequest, 'should reach the exit_plan_mode question so the test can continue the same session');
		assert.strictEqual(sessionAddedCount(), 0, 'should not create a second session while answering the plan-mode question');

		// Do what a real UI client does once the user accepted the plan:
		// update the session config so later turns leave plan mode. Otherwise
		// the agent host would set the SDK's mode back to 'plan' on the next
		// send, because the session config would still hold 'plan'.
		await switchMode(50, 'interactive');

		const followupOutcome = await driveTurnToCompletion(client, session, 'turn-followup',
			'What did the plan I just approved say to print? Reply with exactly "hello world".', 100,
		);
		assert.strictEqual(followupOutcome.sawPendingConfirmation, false, 'follow-up turn should not surface new pending confirmations');
		assert.match(followupOutcome.responseText, /hello world/i, 'follow-up turn should retain the original plan context');
		assert.strictEqual(sessionAddedCount(), 0, 'sending another message should stay on the same session instead of forking');

		const { snapshot } = await client.call<SubscribeResult>('subscribe', { resource: session });
		const finalState = snapshot.state as SessionState;
		assert.strictEqual(finalState.summary.resource, session, 'follow-up turn should keep the original session resource');
	});
582

583
	// ---- Abort / cancel -----------------------------------------------------
584

585
	// Starts a long-running turn, aborts it once output begins, and checks
	// that the server echoes the abort action.
	test('can abort a running turn', async function () {
		this.timeout(120_000);

		const workingDirectory = URI.file(tmpdir()).toString();
		const session = await createRealSession(client, 'real-sdk-abort', createdSessions, workingDirectory);
		dispatchTurn(client, session, 'turn-abort', 'Write a very long essay about the history of computing', 1);

		// Hold off until the turn has visibly started (first response part or tool call).
		await client.waitForNotification(
			n => isActionNotification(n, 'session/responsePart') || isActionNotification(n, 'session/toolCallStart'),
			60_000,
		);

		const abortAction = {
			type: 'session/abortTurn',
			session,
		};
		client.notify('dispatchAction', { clientSeq: 2, action: abortAction });

		// Only the echoed abort action is awaited. turnComplete is deliberately
		// not awaited: the real Copilot SDK may keep streaming after an abort,
		// and the turn may not terminate within the test timeout.
		await client.waitForNotification(
			n => isActionNotification(n, 'session/abortTurn'),
			10_000,
		);
	});
614

615
	// ---- Working directory correctness --------------------------------------
616

617
	// The requested working directory must be reported both in the
	// sessionAdded notification and in the subscribe snapshot.
	test('session is created with the correct working directory', async function () {
		this.timeout(120_000);

		// A real temp directory, so the path exists on disk; it is registered
		// in `tempDirs` for removal during teardown.
		const workDir = mkdtempSync(`${tmpdir()}/ahp-test-`);
		tempDirs.push(workDir);
		const expectedWorkingDirectory = URI.file(workDir).toString();

		await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-workdir' });
		await client.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() });

		const requestedSession = URI.from({ scheme: 'copilotcli', path: `/real-test-wd-${Date.now()}` }).toString();
		await client.call('createSession', { session: requestedSession, provider: 'copilotcli', workingDirectory: expectedWorkingDirectory });

		// 1. The sessionAdded notification carries the working directory.
		const broadcast = await client.waitForNotification(n => {
			if (n.method !== 'notification') {
				return false;
			}
			return (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded';
		}, 15_000);
		const { summary } = (broadcast.params as INotificationBroadcastParams).notification as SessionAddedNotification;
		createdSessions.push(summary.resource);
		assert.strictEqual(
			summary.workingDirectory,
			expectedWorkingDirectory,
			`sessionAdded notification should carry the requested working directory`,
		);

		// 2. The subscribe snapshot carries it as well.
		const { snapshot } = await client.call<SubscribeResult>('subscribe', { resource: summary.resource });
		const subscribedState = snapshot.state as SessionState;
		assert.strictEqual(
			subscribedState.summary.workingDirectory,
			expectedWorkingDirectory,
			`subscribe snapshot summary should carry the requested working directory`,
		);
	});
654

655
	// ---- Worktree isolation -------------------------------------------------
656

657
	test('worktree session uses the resolved worktree as working directory', async function () {
658
		this.timeout(120_000);
659

660
		// Set up a minimal git repo so the server can create a worktree
661
		const tempDir = mkdtempSync(`${tmpdir()}/ahp-wt-test-`);
662
		tempDirs.push(tempDir, `${tempDir}.worktrees`);
663
		execSync('git init', { cwd: tempDir });
664
		execSync('git config user.name "Agent Host Test"', { cwd: tempDir });
665
		execSync('git config user.email "[email protected]"', { cwd: tempDir });
666
		execSync('git commit --allow-empty -m "init"', { cwd: tempDir });
667
		const defaultBranch = execSync('git branch --show-current', { cwd: tempDir, encoding: 'utf-8' }).trim();
668
		const workingDirUri = URI.file(tempDir).toString();
669

670
		await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-worktree' });
671
		await client.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() });
672

673
		const sessionUri = URI.from({ scheme: 'copilotcli', path: `/real-test-wt-${Date.now()}` }).toString();
674
		await client.call('createSession', {
675
			session: sessionUri,
676
			provider: 'copilotcli',
677
			workingDirectory: workingDirUri,
678
			config: { isolation: 'worktree', branch: defaultBranch },
679
		});
680

681
		const addedNotif = await client.waitForNotification(n =>
682
			n.method === 'notification' && (n.params as INotificationBroadcastParams).notification.type === 'notify/sessionAdded',
683
			15_000,
684
		);
685
		const addedSummary = ((addedNotif.params as INotificationBroadcastParams).notification as SessionAddedNotification).summary;
686
		createdSessions.push(addedSummary.resource);
687

688
		// Subscribe so we receive action broadcasts for this session
689
		await client.call<SubscribeResult>('subscribe', { resource: addedSummary.resource });
690

691
		// Verify the worktree path is in the summary
692
		assert.ok(
693
			addedSummary.workingDirectory,
694
			'sessionAdded notification should have a workingDirectory',
695
		);
696
		assert.ok(
697
			addedSummary.workingDirectory!.includes('.worktrees'),
698
			`workingDirectory should be under the .worktrees folder, got: ${addedSummary.workingDirectory}`,
699
		);
700
		const resolvedWorkingDirectoryPath = URI.parse(addedSummary.workingDirectory!).fsPath;
701

702
		// Set the active client with tools (matching real VS Code flow where
703
		// activeClientChanged is dispatched AFTER createSession). When the next
704
		// sendMessage detects the tools changed vs the session's creation-time
705
		// snapshot, it disposes the SDK session and re-creates it via
706
		// _resumeSession. That resume path must use the worktree working
707
		// directory, not the original repo path.
708
		client.notify('dispatchAction', {
709
			clientSeq: 1,
710
			action: {
711
				type: 'session/activeClientChanged',
712
				session: addedSummary.resource,
713
				activeClient: {
714
					clientId: 'real-sdk-worktree',
715
					displayName: 'Test Client',
716
					tools: [
717
						{
718
							name: 'test_echo',
719
							description: 'A harmless echo tool for testing',
720
							inputSchema: { type: 'object', properties: { message: { type: 'string' } } },
721
						},
722
					],
723
				},
724
			},
725
		});
726

727
		// Send a turn — this triggers sendMessage, which will detect the tools
728
		// changed and refresh the session (dispose + _resumeSession). The
729
		// resumed session should still have the worktree as its working
730
		// directory. Ask a safe, read-only question about the working directory.
731
		client.clearReceived();
732
		dispatchTurn(client, addedSummary.resource, 'turn-wt',
733
			'What is your current working directory? Reply with just the absolute path and nothing else.', 2);
734

735
		// Wait for the turn to complete or error
736
		await client.waitForNotification(
737
			n => isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error'),
738
			90_000,
739
		);
740

741
		// The session refresh should succeed — if it errors with
742
		// "workingDirectory is required to resume", the worktree path was lost.
743
		const errors = client.receivedNotifications(n => isActionNotification(n, 'session/error'));
744
		assert.strictEqual(errors.length, 0,
745
			errors.length > 0
746
				? `Session error during turn (worktree path lost on resume): ${(getActionEnvelope(errors[0]).action as { error?: { message?: string } }).error?.message}`
747
				: '',
748
		);
749

750
		// Verify the turn got a response (the session resumed successfully)
751
		const responseParts = client.receivedNotifications(n => isActionNotification(n, 'session/responsePart'));
752
		assert.ok(responseParts.length > 0, 'should have received at least one response part after session refresh');
753

754
		client.clearReceived();
755
		dispatchTurn(client, addedSummary.resource, 'turn-wt-terminal', 'Run the shell command: pwd', 3);
756

757
		const toolStartNotif = await client.waitForNotification(
758
			n => isActionNotification(n, 'session/toolCallStart'),
759
			60_000,
760
		);
761
		const toolStartAction = getActionEnvelope(toolStartNotif).action as { toolCallId: string };
762

763
		const toolReadyNotif = await client.waitForNotification(
764
			n => isActionNotification(n, 'session/toolCallReady'),
765
			30_000,
766
		);
767
		const toolReadyAction = getActionEnvelope(toolReadyNotif).action as { confirmed?: string };
768
		if (!toolReadyAction.confirmed) {
769
			client.notify('dispatchAction', {
770
				clientSeq: 4,
771
				action: {
772
					type: 'session/toolCallConfirmed',
773
					session: addedSummary.resource,
774
					turnId: 'turn-wt-terminal',
775
					toolCallId: toolStartAction.toolCallId,
776
					approved: true,
777
				},
778
			});
779
		}
780

781
		const terminalContentNotif = await client.waitForNotification(n => {
782
			if (!isActionNotification(n, 'session/toolCallContentChanged')) {
783
				return false;
784
			}
785
			const action = getActionEnvelope(n).action as { toolCallId: string; content: readonly ToolResultContent[] };
786
			return action.toolCallId === toolStartAction.toolCallId && terminalResourceFromContent(action.content) !== undefined;
787
		}, 30_000);
788
		const terminalContentAction = getActionEnvelope(terminalContentNotif).action as { content: readonly ToolResultContent[] };
789
		const terminalUri = terminalResourceFromContent(terminalContentAction.content);
790
		assert.ok(terminalUri, 'shell tool should expose its terminal resource');
791

792
		const terminalSubscribeResult = await client.call<SubscribeResult>('subscribe', { resource: terminalUri });
793
		const initialTerminalState = terminalSubscribeResult.snapshot.state as TerminalState;
794
		assert.strictEqual(initialTerminalState.cwd, resolvedWorkingDirectoryPath, 'terminal should be created in the resolved worktree directory');
795

796
		await client.waitForNotification(n => isActionNotification(n, 'session/turnComplete'), 90_000);
797
		const terminalSnapshot = await client.call<SubscribeResult>('subscribe', { resource: terminalUri });
798
		const terminalState = terminalSnapshot.snapshot.state as TerminalState;
799
		assert.ok(terminalText(terminalState).includes(resolvedWorkingDirectoryPath), `pwd output should include the resolved worktree path ${resolvedWorkingDirectoryPath}`);
800
	});
801

802
	// ---- Subagent tool call grouping ----------------------------------------
803

804
	test('subagent tool calls are routed to the subagent session, not flat in the parent', async function () {
		// Purpose: ask the model to delegate a directory listing to a `task`
		// subagent, then check that every inner tool call was broadcast on the
		// subagent's session and that the parent only carries the outer `task`
		// call.
		this.timeout(180_000);

		// Set up a small fixture directory so the subagent has something to view.
		const tempDir = mkdtempSync(`${tmpdir()}/ahp-subagent-test-`);
		tempDirs.push(tempDir);
		writeFileSync(`${tempDir}/file-a.txt`, 'alpha');
		writeFileSync(`${tempDir}/file-b.txt`, 'beta');

		const sessionUri = await createRealSession(client, 'real-sdk-subagent', createdSessions, URI.file(tempDir).toString());

		// Auto-approve every tool that needs confirmation while the turn runs.
		// Multiple inner tool calls may need approval; doing this in a background
		// loop keeps the turn unblocked. Track processed serverSeqs so we don't
		// busy-spin on already-handled notifications (waitForNotification returns
		// matching notifications from the queue without consuming them). Using
		// serverSeq rather than toolCallId allows the same tool to be legitimately
		// re-confirmed in a later notification.
		let approvalsActive = true;
		let approvalSeq = 1000;
		const processedSeqs = new Set<number>();
		const approvalLoop = (async () => {
			while (approvalsActive) {
				try {
					// The predicate only accepts unconfirmed toolCallReady
					// actions whose serverSeq has not been handled yet, so the
					// notification resolved below always needs an approval.
					const ready = await client.waitForNotification(n => {
						if (!isActionNotification(n, 'session/toolCallReady')) {
							return false;
						}
						const envelope = getActionEnvelope(n);
						const a = envelope.action as { confirmed?: string };
						return !a.confirmed && !processedSeqs.has(envelope.serverSeq);
					}, 2_000);
					const envelope = getActionEnvelope(ready);
					processedSeqs.add(envelope.serverSeq);
					const action = envelope.action as { session: string; turnId: string; toolCallId: string };
					client.notify('dispatchAction', {
						clientSeq: ++approvalSeq,
						action: {
							type: 'session/toolCallConfirmed',
							session: action.session,
							turnId: action.turnId,
							toolCallId: action.toolCallId,
							approved: true,
						},
					});
				} catch {
					// Timeout — re-poll. Loop exits when approvalsActive flips.
				}
			}
		})();

		let subagentSessionUri: string;
		try {
			// Encourage the model to delegate via the `task` subagent tool. The exact
			// behaviour is non-deterministic — if the model declines we fail the test
			// with a clear message rather than silently passing.
			dispatchTurn(client, sessionUri, 'turn-sa',
				'Use the `task` tool to spawn a subagent to list the files in the current working directory. ' +
				'The subagent should call a single read-only tool (e.g. `view` or `bash` with `ls`) to enumerate the directory. ' +
				'Do not enumerate the directory yourself — delegate to the subagent.',
				1);

			// Wait for the parent's `task` tool call to expose a Subagent content
			// block carrying the subagent session URI.
			const subagentContentNotif = await client.waitForNotification(n => {
				if (!isActionNotification(n, 'session/toolCallContentChanged')) {
					return false;
				}
				const action = getActionEnvelope(n).action as { session: string; content: readonly ToolResultContent[] };
				return action.session === sessionUri && action.content.some(c => c.type === ToolResultContentType.Subagent);
			}, 120_000);

			const parentContent = (getActionEnvelope(subagentContentNotif).action as { content: readonly ToolResultContent[] }).content;
			const subagentRef = parentContent.find((c): c is ToolResultSubagentContent => c.type === ToolResultContentType.Subagent)!;
			subagentSessionUri = subagentRef.resource as unknown as string;
			assert.ok(typeof subagentSessionUri === 'string' && isSubagentSession(subagentSessionUri),
				`subagent session URI should be subagent-shaped, got: ${JSON.stringify(subagentSessionUri)}`);

			// Subscribe so we receive the subagent session's own action broadcasts.
			await client.call<SubscribeResult>('subscribe', { resource: subagentSessionUri });

			// Wait for the parent turn to complete (with a generous timeout — the
			// subagent's turn must finish first).
			await client.waitForNotification(n => {
				if (!isActionNotification(n, 'session/turnComplete')) {
					return false;
				}
				return (getActionEnvelope(n).action as { session: string }).session === sessionUri;
			}, 150_000);
		} finally {
			// Always stop the background loop — even when a wait above timed
			// out or an assertion threw — so it cannot keep polling and
			// dispatching approvals after this test has finished. The loop
			// swallows its own errors, so awaiting it here never rejects.
			approvalsActive = false;
			await approvalLoop;
		}

		// Group all received toolCallStart actions by the session they target.
		// This is the bug's signature: when inner tool_start arrives before
		// subagent_started, the inner tool calls leak into the parent session.
		const toolStarts = client.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'))
			.map(n => getActionEnvelope(n).action as SessionToolCallStartAction);

		const parentStarts = toolStarts.filter(a => (a.session as unknown as string) === sessionUri);
		const subagentStarts = toolStarts.filter(a => (a.session as unknown as string) === subagentSessionUri);

		// Parent should only carry the outer `task` tool call. Any other
		// tool call on the parent indicates the inner-tool routing bug.
		const parentNonTaskStarts = parentStarts.filter(a => a.toolName !== 'task');
		assert.deepStrictEqual(
			parentNonTaskStarts.map(a => a.toolName),
			[],
			`parent session should not contain inner tool calls; found: ${JSON.stringify(parentNonTaskStarts.map(a => a.toolName))}`,
		);

		// Subagent session must have at least one inner tool call. If this
		// fails, the subagent never actually executed any work — likely the
		// model didn't delegate as instructed.
		assert.ok(subagentStarts.length >= 1,
			`subagent session should contain at least one inner tool call, got ${subagentStarts.length}. ` +
			`Parent tool calls: ${JSON.stringify(parentStarts.map(a => a.toolName))}`);
	});
924

925
	// ---- Model discovery -----------------------------------------------------
926

927
	test('listModels returns well-shaped model entries after authenticate', async function () {
		// Purpose: after authenticating, the root state's `copilotcli` agent
		// must advertise a non-empty model list whose entries all have the
		// expected field types, including the synthetic `auto` entry.
		this.timeout(60_000);

		await client.call('initialize', { protocolVersion: PROTOCOL_VERSION, clientId: 'real-sdk-list-models' }, 30_000);

		// The root-state subscription has to exist *before* authenticate is
		// called; otherwise the agentsChanged broadcast that carries the
		// populated model list could be missed.
		const rootSubscription = await client.call<SubscribeResult>('subscribe', { resource: ROOT_STATE_URI }, 30_000);
		const rootState = rootSubscription.snapshot.state as RootState;
		const initialCopilotAgent = rootState.agents.find(candidate => candidate.provider === 'copilotcli');
		assert.ok(initialCopilotAgent, `Expected copilotcli agent in root state, got: ${rootState.agents.map(a => a.provider).join(', ')}`);

		await client.call('authenticate', { resource: 'https://api.github.com', token: resolveGitHubToken() }, 30_000);

		// Model loading is asynchronous: block until an agentsChanged action
		// shows the copilotcli agent with at least one model.
		const agentsChangedNotif = await client.waitForNotification(n => {
			if (!isActionNotification(n, 'root/agentsChanged')) {
				return false;
			}
			const changed = getActionEnvelope(n).action as RootAgentsChangedAction;
			const candidate = changed.agents.find(a => a.provider === 'copilotcli');
			return candidate !== undefined && candidate.models.length > 0;
		}, 30_000);

		const agentsChanged = getActionEnvelope(agentsChangedNotif).action as RootAgentsChangedAction;
		const copilot = agentsChanged.agents.find(a => a.provider === 'copilotcli')!;
		const models = copilot.models;

		assert.ok(models.length > 0, 'Expected at least one model from listModels');

		// Each entry must match the shape CopilotAgent._listModels produces.
		// maxContextWindow may be absent: synthetic SDK entries (e.g. the
		// `auto` router) ship with `capabilities: {}` and no fixed window.
		for (const entry of models) {
			const dump = JSON.stringify(entry);
			const windowSize = entry.maxContextWindow;
			const vision = entry.supportsVision;

			assert.strictEqual(typeof entry.id, 'string', `model.id should be a string: ${dump}`);
			assert.ok(entry.id.length > 0, `model.id should be non-empty: ${dump}`);
			assert.strictEqual(typeof entry.name, 'string', `model.name should be a string: ${dump}`);
			assert.strictEqual(entry.provider, 'copilotcli', `model.provider should be copilotcli: ${dump}`);
			assert.ok(windowSize === undefined || (typeof windowSize === 'number' && windowSize > 0),
				`model.maxContextWindow should be undefined or a positive number: ${dump}`);
			assert.ok(vision === undefined || typeof vision === 'boolean', `model.supportsVision should be boolean or undefined: ${dump}`);
		}

		// The synthetic `auto` router model has no fixed context window but
		// must still be listed.
		const modelIds = models.map(m => m.id);
		assert.ok(models.some(m => m.id === 'auto'), `Expected 'auto' model in list, got: ${modelIds.join(', ')}`);
	});
974

975
	// ---- Redundant cd-prefix stripping --------------------------------------
976

977
	test('strips redundant `cd <workingDirectory> &&` prefix from shell tool calls', async function () {
		// Purpose: when the model emits `cd <workingDirectory> && <command>`
		// for a shell tool call, the toolInput surfaced to the client must
		// have the redundant cd prefix removed.
		this.timeout(180_000);

		const tempDir = mkdtempSync(`${tmpdir()}/ahp-cd-strip-test-`);
		tempDirs.push(tempDir);
		const expectedWorkingDirPath = tempDir;
		const sessionUri = await createRealSession(client, 'real-sdk-cd-strip', createdSessions, URI.file(tempDir).toString());

		// Coax the model into producing a `cd <wd> && X` form. The exact text is
		// non-deterministic, so the test asserts on rewrite behavior conditional
		// on actually receiving a cd-prefixed command.
		client.clearReceived();
		dispatchTurn(client, sessionUri, 'turn-cd-strip',
			`Run this exact shell command, do not modify it: cd ${expectedWorkingDirPath} && echo strip-me-please`,
			1);

		// Wait for the toolCallReady action that carries the rewritten toolInput.
		const toolReadyNotif = await client.waitForNotification(n => {
			if (!isActionNotification(n, 'session/toolCallReady')) {
				return false;
			}
			const action = getActionEnvelope(n).action as { toolInput?: string };
			return typeof action.toolInput === 'string' && action.toolInput.includes('echo strip-me-please');
		}, 90_000);

		const toolReadyAction = getActionEnvelope(toolReadyNotif).action as { toolCallId: string; toolInput?: string; confirmed?: string };
		const toolInput = toolReadyAction.toolInput!;

		// The core assertion: regardless of whether the model emitted the cd
		// prefix verbatim or already pre-stripped it, the toolInput surfaced to
		// the client must NOT contain the redundant `cd <tempDir> &&` prefix.
		// Use a regex that anchors to the start of the command and tolerates
		// optional surrounding quotes around the directory plus either `&&`
		// or `;` as the chain operator (so quoted variants like
		// `cd "<wd>" && …` and pwsh-style `cd <wd>; …` are both detected).
		const escapedWorkingDirPath = expectedWorkingDirPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		const redundantWorkingDirCdPrefix = new RegExp(
			`^\\s*cd\\s+(?:"${escapedWorkingDirPath}"|'${escapedWorkingDirPath}'|${escapedWorkingDirPath})\\s*(?:&&|;)\\s*`,
		);
		assert.ok(
			!redundantWorkingDirCdPrefix.test(toolInput),
			`toolInput should not contain a redundant cd-prefix targeting the working directory; got: ${JSON.stringify(toolInput)}`,
		);
		assert.ok(
			toolInput.includes('echo strip-me-please'),
			`toolInput should contain the rewritten command body; got: ${JSON.stringify(toolInput)}`,
		);

		// Received notifications stay queued after waitForNotification returns
		// them, so remember which toolCallReady envelopes were already handled.
		// Without this the drain loop below would keep re-matching the first
		// toolCallReady and never observe turnComplete.
		const handledReadySeqs = new Set<number>([getActionEnvelope(toolReadyNotif).serverSeq]);
		let nextClientSeq = 2;

		// Approve so the turn can complete. If it was already auto-confirmed
		// (`confirmed` is set), skip the manual approval.
		if (!toolReadyAction.confirmed) {
			client.notify('dispatchAction', {
				clientSeq: nextClientSeq++,
				action: {
					type: 'session/toolCallConfirmed',
					session: sessionUri,
					turnId: 'turn-cd-strip',
					toolCallId: toolReadyAction.toolCallId,
					approved: true,
				},
			});
		}

		// Drive any further confirmations to completion so teardown is clean.
		while (true) {
			const next = await client.waitForNotification(n => {
				if (isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error')) {
					return true;
				}
				// Only toolCallReady actions that have not been handled yet.
				return isActionNotification(n, 'session/toolCallReady') && !handledReadySeqs.has(getActionEnvelope(n).serverSeq);
			}, 90_000);
			if (isActionNotification(next, 'session/turnComplete') || isActionNotification(next, 'session/error')) {
				break;
			}
			const envelope = getActionEnvelope(next);
			handledReadySeqs.add(envelope.serverSeq);
			const action = envelope.action as { session: string; turnId: string; toolCallId: string; confirmed?: string };
			if (!action.confirmed) {
				client.notify('dispatchAction', {
					clientSeq: nextClientSeq++,
					action: {
						type: 'session/toolCallConfirmed',
						session: action.session,
						turnId: action.turnId,
						toolCallId: action.toolCallId,
						approved: true,
					},
				});
			}
		}
	});
1064

1065
	// ---- write_bash skipPermission regression test --------------------------
1066

1067
	test('write_bash never triggers a permission request (skipPermission flag)', async function () {
		this.timeout(180_000);

		// What this test verifies:
		//   `write_bash` (and `read_bash` / `bash_shutdown` / `list_bash`) are
		//   registered as external tools with `skipPermission: true`, mirroring
		//   the SDK's built-in shell helpers which never call `permissions.request`.
		//   This regression test catches accidental removal of that flag — if it's
		//   removed, the SDK will route write_bash through our permission flow and
		//   the test will fail with `observedToolNames` containing 'write_bash'.
		//
		// How it works:
		//   1. Allow-list permits ONLY `bash` (the interactive prompt). write_bash
		//      is intentionally absent from the allow list.
		//   2. The model is instructed to use `write_bash`. If any permission
		//      request appears for write_bash, the loop records it in
		//      `observedToolNames` and we fail the assertion.
		//   3. We assert that bash actually ran AND that write_bash appeared in
		//      toolCallStart notifications (so the test is non-vacuous — the model
		//      actually tried to use the tool, not just piped input via bash).

		const tempDir = mkdtempSync(`${tmpdir()}/ahp-write-bash-skip-perm-`);
		tempDirs.push(tempDir);
		const sessionUri = await createRealSession(client, 'real-sdk-write-bash-skip-perm', createdSessions, URI.file(tempDir).toString());

		const approvalLoop = startBackgroundApprovalLoop(client, {
			approvalSeqStart: 100,
			allow: [
				{
					// Setup bash command — the interactive `read` prompt.
					toolName: 'bash',
					matchInput: input => !!input && input.includes('read') && input.includes('Got:'),
				},
				// Note: write_bash is intentionally NOT in the allow list. With
				// skipPermission: true, the SDK won't ask us — so the test passes.
				// Without it, the SDK would ask, the loop would deny + record an
				// error, and the test would fail loudly.
			],
		});

		try {
			dispatchTurn(client, sessionUri, 'turn-write-bash-skip-perm',
				'You MUST demonstrate the `write_bash` tool. Steps, in order:\n' +
				'1. Use the `bash` tool to run exactly: read -p "Enter: " v; echo "Got: $v"\n' +
				'   This will block waiting for stdin.\n' +
				'2. While that bash call is waiting, you MUST use the `write_bash` tool to send the input "hello\\n" to it.\n' +
				'   Do NOT pipe the input via the original bash command. Do NOT use `echo hello | ...`.\n' +
				'   You MUST go through the `write_bash` tool — that is the entire point of this task.\n' +
				'3. After the shell prints "Got: hello", reply with the single word "done".',
				1);

			await client.waitForNotification(
				n => isActionNotification(n, 'session/turnComplete') || isActionNotification(n, 'session/error'),
				150_000,
			);
		} finally {
			// Stop the background approval loop even when the wait above
			// times out, so it cannot keep running after this test ends.
			await approvalLoop.stop();
		}

		// Sanity check: the bash setup command actually ran. Otherwise the
		// model ignored the prompt and the write_bash assertion below is vacuous.
		assert.ok(approvalLoop.approvedToolNames.has('bash'),
			`expected the model to invoke bash for setup; observed approved tools: ${[...approvalLoop.approvedToolNames].join(', ') || '<none>'}`);

		// Non-vacuousness check: write_bash must have actually been invoked
		// (seen in a toolCallStart notification). If the model piped input via
		// the original bash command instead of using write_bash, this fails.
		const writeBashStarts = client.receivedNotifications(n => isActionNotification(n, 'session/toolCallStart'))
			.map(n => getActionEnvelope(n).action as { toolName?: string })
			.filter(a => a.toolName === 'write_bash');
		assert.ok(writeBashStarts.length > 0,
			`expected write_bash to be invoked at least once (toolCallStart), but it was never called. The model may have piped input via the original bash command instead.`);

		// The actual regression check: write_bash must never reach our
		// permission handler. If this fails, `skipPermission: true` was likely
		// removed from copilotShellTools.ts.
		assert.ok(!approvalLoop.observedToolNames.has('write_bash'),
			`write_bash should be auto-approved by the SDK (skipPermission: true) and never trigger a permission request, but the test observed one. Observed permission requests: ${[...approvalLoop.observedToolNames].join(', ')}`);

		// Any other unexpected permission requests (e.g. an unrelated tool the
		// model decided to use) would also have been recorded as errors.
		assert.deepStrictEqual(approvalLoop.errors, [],
			`unexpected approval-loop errors: ${approvalLoop.errors.join('; ')}`);
	});
1148
});
1149

1150
Product

Resources

Company