/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

// @ts-check

/**
 * Local mock server that implements the OpenAI Chat Completions streaming API.
 * Used by the chat perf benchmark to replace the real LLM backend with
 * deterministic, zero-latency responses.
 *
 * Supports scenario-based responses: user message content is scanned (newest
 * first) for a `[scenario:ID]` tag that selects a registered scenario.
 * Requests without a known tag get a default text-only response.
 */

const http = require('http');
const path = require('path');
const { EventEmitter } = require('events');

const ROOT = path.join(__dirname, '..', '..', '..');

// -- Scenario fixtures -------------------------------------------------------

/**
 * @typedef {{ content: string, delayMs: number }} StreamChunk
 */

/**
 * A single turn in a multi-turn scenario.
 *
 * @typedef {{
 * 	kind: 'tool-calls',
 * 	toolCalls: Array<{ toolNamePattern: RegExp, arguments: Record<string, any> }>,
 * } | {
 * 	kind: 'content',
 * 	chunks: StreamChunk[],
 * } | {
 * 	kind: 'thinking',
 * 	thinkingChunks: StreamChunk[],
 * 	chunks: StreamChunk[],
 * } | {
 * 	kind: 'user',
 * 	message: string,
 * }} ScenarioTurn
 */

/**
 * A scenario turn produced by the model (any turn except 'user').
 *
 * @typedef {{
 * 	kind: 'tool-calls',
 * 	toolCalls: Array<{ toolNamePattern: RegExp, arguments: Record<string, any> }>,
 * } | {
 * 	kind: 'content',
 * 	chunks: StreamChunk[],
 * } | {
 * 	kind: 'thinking',
 * 	thinkingChunks: StreamChunk[],
 * 	chunks: StreamChunk[],
 * }} ModelScenarioTurn
 */

/**
 * A model turn that emits content chunks.
 *
 * @typedef {{
 * 	kind: 'content',
 * 	chunks: StreamChunk[],
 * } | {
 * 	kind: 'thinking',
 * 	thinkingChunks: StreamChunk[],
 * 	chunks: StreamChunk[],
 * }} ContentScenarioTurn
 */

/**
 * A multi-turn scenario — an ordered sequence of turns.
 * The mock server determines which model turn to serve based on the number
 * of assistant→tool round-trips already present in the conversation.
 * User turns are skipped by the server and instead injected by the test
 * harness, which types them into the chat input and presses Enter.
 *
 * @typedef {{
 * 	type: 'multi-turn',
 * 	turns: ScenarioTurn[],
 * }} MultiTurnScenario
 */

/**
 * @param {any} scenario
 * @returns {scenario is MultiTurnScenario}
 */
function isMultiTurnScenario(scenario) {
	return scenario && typeof scenario === 'object' && scenario.type === 'multi-turn';
}

/**
 * Helper for building scenario chunk sequences with timing control.
 */
class ScenarioBuilder {
	constructor() {
		/** @type {StreamChunk[]} */
		this.chunks = [];
	}

	/**
	 * Emit a content chunk immediately (no delay before it).
	 * @param {string} content
	 * @returns {this}
	 */
	emit(content) {
		this.chunks.push({ content, delayMs: 0 });
		return this;
	}

	/**
	 * Wait, then emit a content chunk — simulates network/token generation latency.
	 * @param {number} ms - delay in milliseconds before this chunk
	 * @param {string} content
	 * @returns {this}
	 */
	wait(ms, content) {
		this.chunks.push({ content, delayMs: ms });
		return this;
	}

	/**
	 * Emit multiple chunks with uniform inter-chunk delay.
	 * @param {string[]} contents
	 * @param {number} [delayMs=15] - delay between each chunk (default ~1 frame)
	 * @returns {this}
	 */
	stream(contents, delayMs = 15) {
		for (const content of contents) {
			this.chunks.push({ content, delayMs });
		}
		return this;
	}

	/**
	 * Emit multiple chunks with no delay (burst).
	 * @param {string[]} contents
	 * @returns {this}
	 */
	burst(contents) {
		return this.stream(contents, 0);
	}

	/** @returns {StreamChunk[]} */
	build() {
		return this.chunks;
	}
}
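
// Example (illustrative, not one of the real perf fixtures): building a
// streamed response and registering it under a hypothetical ID
// (registerScenario is defined at the bottom of this file).
//
//   registerScenario('hello-world', new ScenarioBuilder()
//       .emit('Hello')                             // first token, immediately
//       .wait(50, ' world.')                       // 50ms generation pause
//       .stream([' How', ' can', ' I', ' help?'])  // ~15ms between tokens
//       .build());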

/** @type {Record<string, StreamChunk[] | MultiTurnScenario>} */
const SCENARIOS = /** @type {Record<string, StreamChunk[] | MultiTurnScenario>} */ ({});

const DEFAULT_SCENARIO = 'text-only';

/**
 * @returns {StreamChunk[]}
 */
function getDefaultScenarioChunks() {
	const scenario = SCENARIOS[DEFAULT_SCENARIO];
	if (!scenario) {
		throw new Error(`Default scenario '${DEFAULT_SCENARIO}' is not registered`);
	}
	if (isMultiTurnScenario(scenario)) {
		throw new Error(`Default scenario '${DEFAULT_SCENARIO}' must be content-only`);
	}
	return scenario;
}

// -- SSE chunk builder -------------------------------------------------------

const MODEL = 'gpt-4o-2024-08-06';

/**
 * Build a content delta chunk, or a final chunk with `finish_reason: 'stop'`
 * when `finish` is set.
 * @param {string} content
 * @param {number} index - unused; the mock always emits a single choice 0
 * @param {boolean} finish
 */
function makeChunk(content, index, finish) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: finish ? {} : { content },
			finish_reason: finish ? 'stop' : null,
			content_filter_results: {},
		}],
		usage: null,
	};
}
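
// For reference, every chunk built above is serialized as one SSE event line
// (timestamp illustrative), followed by a blank line — see the res.write calls
// in the stream* functions below:
//
//   data: {"id":"chatcmpl-perf-benchmark","object":"chat.completion.chunk","created":1700000000,"model":"gpt-4o-2024-08-06","choices":[{"index":0,"delta":{"content":"Hello"},"finish_reason":null,"content_filter_results":{}}],"usage":null}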

/**
 * Build the initial chunk that opens an assistant message (role + empty content).
 */
function makeInitialChunk() {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: { role: 'assistant', content: '' },
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a tool-call initial chunk (role only, no content).
 */
function makeToolCallInitialChunk() {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: { role: 'assistant', content: null },
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a tool-call function-start chunk.
 * @param {number} index - tool call index
 * @param {string} callId - unique call ID
 * @param {string} functionName - tool function name
 */
function makeToolCallStartChunk(index, callId, functionName) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: {
				tool_calls: [{
					index,
					id: callId,
					type: 'function',
					function: { name: functionName, arguments: '' },
				}],
			},
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a tool-call arguments chunk.
 * @param {number} index - tool call index
 * @param {string} argsFragment - partial JSON arguments
 */
function makeToolCallArgsChunk(index, argsFragment) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: {
				tool_calls: [{
					index,
					function: { arguments: argsFragment },
				}],
			},
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a tool-call finish chunk.
 */
function makeToolCallFinishChunk() {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: {},
			finish_reason: 'tool_calls',
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a thinking (chain-of-thought summary) chunk.
 * Uses the `cot_summary` field in the delta, matching the Copilot API wire format.
 * @param {string} text - thinking text fragment
 */
function makeThinkingChunk(text) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: { cot_summary: text },
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}

/**
 * Build a thinking ID chunk (sent after thinking text to close the block).
 * @param {string} cotId - unique chain-of-thought ID
 */
function makeThinkingIdChunk(cotId) {
	return {
		id: 'chatcmpl-perf-benchmark',
		object: 'chat.completion.chunk',
		created: Math.floor(Date.now() / 1000),
		model: MODEL,
		choices: [{
			index: 0,
			delta: { cot_id: cotId },
			finish_reason: null,
			content_filter_results: {},
		}],
		usage: null,
	};
}
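
// Taken together, a thinking turn streams as:
//   delta.cot_summary × N → delta.cot_id (closes the block) → delta.content × M
// (see streamThinkingThenContent below), and a tool-call turn streams as:
//   role chunk → function-start chunk → argument fragments → finish_reason 'tool_calls'
// (see streamToolCalls below).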

// -- Request handler ---------------------------------------------------------

/**
 * @param {http.IncomingMessage} req
 * @param {http.ServerResponse} res
 */
function handleRequest(req, res) {
	const contentLength = req.headers['content-length'] || '0';
	const ts = new Date().toISOString().slice(11, -1); // HH:MM:SS.mmm
	console.log(`[mock-llm] ${ts} ${req.method} ${req.url} (${contentLength} bytes)`);

	// CORS
	res.setHeader('Access-Control-Allow-Origin', '*');
	res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS');
	res.setHeader('Access-Control-Allow-Headers', '*');
	if (req.method === 'OPTIONS') { res.writeHead(204); res.end(); return; }

	const url = new URL(req.url || '/', `http://${req.headers.host}`);
	const pathname = url.pathname;
	const json = (/** @type {number} */ status, /** @type {any} */ data) => {
		res.writeHead(status, { 'Content-Type': 'application/json' });
		res.end(JSON.stringify(data));
	};
	const readBody = () => new Promise(resolve => {
		let body = '';
		req.on('data', chunk => { body += chunk; });
		req.on('end', () => resolve(body));
	});

	// -- Health -------------------------------------------------------
	if (pathname === '/health') { res.writeHead(200); res.end('ok'); return; }

	// -- Token endpoints (DomainService.tokenURL / tokenNoAuthURL) ----
	// /copilot_internal/v2/token, /copilot_internal/v2/nltoken
	if (pathname.startsWith('/copilot_internal/')) {
		if (pathname.includes('/token') || pathname.includes('/nltoken')) {
			json(200, {
				token: 'perf-benchmark-fake-token',
				expires_at: Math.floor(Date.now() / 1000) + 3600,
				refresh_in: 1800,
				sku: 'free_limited_copilot',
				individual: true,
				copilot_plan: 'free',
				endpoints: {
					api: `http://${req.headers.host}`,
					proxy: `http://${req.headers.host}`,
				},
			});
		} else {
			// /copilot_internal/user, /copilot_internal/content_exclusion, etc.
			json(200, {});
		}
		return;
	}

	// -- Telemetry (DomainService.telemetryURL) ----------------------
	if (pathname === '/telemetry') { json(200, {}); return; }

	// -- Model Router (DomainService.capiModelRouterURL = /models/session/intent) --
	// The automode service POSTs here to get the best model for a request.
	if (pathname === '/models/session/intent' && req.method === 'POST') {
		readBody().then(() => {
			json(200, { model: MODEL });
		});
		return;
	}

	// -- Auto Models / Model Session (DomainService.capiAutoModelURL = /models/session) --
	// Returns AutoModeAPIResponse: { available_models, session_token, expires_at }
	if (pathname === '/models/session' && req.method === 'POST') {
		readBody().then(() => {
			json(200, {
				available_models: [MODEL, 'gpt-4o-mini'],
				session_token: 'perf-session-token-' + Date.now(),
				expires_at: Math.floor(Date.now() / 1000) + 3600,
				discounted_costs: {},
			});
		});
		return;
	}

	// -- Models (DomainService.capiModelsURL = /models) --------------
	if (pathname === '/models' && req.method === 'GET') {
		json(200, {
			data: [
				{
					id: MODEL,
					name: 'GPT-4o (Mock)',
					version: '2024-05-13',
					vendor: 'copilot',
					model_picker_enabled: true,
					is_chat_default: true,
					is_chat_fallback: true,
					billing: { is_premium: false, multiplier: 0 },
					capabilities: {
						type: 'chat',
						family: 'gpt-4o',
						tokenizer: 'o200k_base',
						limits: {
							// Use a very large token limit so the Responses API compaction
							// threshold (90% of max_prompt_tokens) is never reached during
							// perf benchmarks.
							max_prompt_tokens: 10000000,
							max_output_tokens: 131072,
							max_context_window_tokens: 10000000,
						},
						supports: {
							streaming: true,
							tool_calls: true,
							parallel_tool_calls: true,
							vision: false,
						},
					},
					supported_endpoints: ['/chat/completions'],
				},
				{
					id: 'gpt-4o-mini',
					name: 'GPT-4o mini (Mock)',
					version: '2024-07-18',
					vendor: 'copilot',
					model_picker_enabled: false,
					is_chat_default: false,
					is_chat_fallback: false,
					billing: { is_premium: false, multiplier: 0 },
					capabilities: {
						type: 'chat',
						family: 'gpt-4o-mini',
						tokenizer: 'o200k_base',
						limits: {
							max_prompt_tokens: 10000000,
							max_output_tokens: 131072,
							max_context_window_tokens: 10000000,
						},
						supports: {
							streaming: true,
							tool_calls: true,
							parallel_tool_calls: true,
							vision: false,
						},
					},
					supported_endpoints: ['/chat/completions'],
				},
			],
		});
		return;
	}

	// -- Model by ID (DomainService.capiModelsURL/{id}) --------------
	if (pathname.startsWith('/models/') && req.method === 'GET') {
		const modelId = pathname.split('/models/')[1]?.split('/')[0];
		if (pathname.endsWith('/policy')) {
			json(200, { state: 'accepted', terms: '' });
			return;
		}
		json(200, {
			id: modelId || MODEL,
			name: 'GPT-4o (Mock)',
			version: '2024-05-13',
			vendor: 'copilot',
			model_picker_enabled: true,
			is_chat_default: true,
			is_chat_fallback: true,
			capabilities: {
				type: 'chat',
				family: 'gpt-4o',
				tokenizer: 'o200k_base',
				limits: { max_prompt_tokens: 10000000, max_output_tokens: 131072, max_context_window_tokens: 10000000 },
				supports: { streaming: true, tool_calls: true, parallel_tool_calls: true, vision: false },
			},
		});
		return;
	}

	// -- Agents (DomainService.remoteAgentsURL = /agents) -------------
	if (pathname.startsWith('/agents')) {
		// /agents/sessions — CopilotSessions
		if (pathname.includes('/sessions')) {
			json(200, { sessions: [], total_count: 0, page_size: 20, page_number: 1 });
		}
		// /agents/swe/models — CCAModelsList
		else if (pathname.includes('/swe/models')) {
			json(200, {
				data: [{
					id: MODEL, name: 'GPT-4o (Mock)', vendor: 'copilot',
					capabilities: { type: 'chat', family: 'gpt-4o', supports: { streaming: true } }
				}]
			});
		}
		// /agents/swe/... — agent jobs, etc.
		else if (pathname.includes('/swe/')) {
			json(200, {});
		}
		// /agents — list agents
		else {
			json(200, { agents: [] });
		}
		return;
	}

	// -- Chat Completions (DomainService.capiChatURL = /chat/completions) --
	if (pathname === '/chat/completions' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Responses API (DomainService.capiResponsesURL = /responses) --
	if (pathname === '/responses' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Messages API (DomainService.capiMessagesURL = /v1/messages) --
	if (pathname === '/v1/messages' && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Proxy completions (/v1/engines/*/completions) ----------------
	if (pathname.includes('/v1/engines/') && req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => handleChatCompletions(body, res));
		return;
	}

	// -- Skills, Search, Embeddings -----------------------------------
	if (pathname === '/skills' || pathname.startsWith('/search/') || pathname.startsWith('/embeddings')) {
		json(200, { data: [] });
		return;
	}

	// -- Catch-all: any remaining POST with messages → chat completions
	if (req.method === 'POST') {
		readBody().then((/** @type {string} */ body) => {
			try {
				const parsed = JSON.parse(/** @type {string} */(body));
				if (parsed.messages && Array.isArray(parsed.messages)) {
					handleChatCompletions(/** @type {string} */(body), res);
					return;
				}
			} catch { }
			json(200, {});
		});
		return;
	}

	// -- Catch-all GET → empty success --------------------------------
	json(200, {});
}

// -- Server lifecycle --------------------------------------------------------

/** Emitted when a scenario chat completion is fully served. */
const serverEvents = new EventEmitter();

/** @param {number} ms */
const sleep = (ms) => new Promise(resolve => setTimeout(resolve, ms));

/**
 * Count the number of model turns already completed for the CURRENT scenario.
 * Only counts assistant messages that appear after the last user message
 * containing a [scenario:X] tag. This prevents assistant messages from
 * previous scenarios (in the same chat session) from inflating the count.
 *
 * @param {any[]} messages
 * @returns {number}
 */
function countCompletedModelTurns(messages) {
	// Find the index of the last user message with a scenario tag
	let scenarioMsgIdx = -1;
	for (let i = messages.length - 1; i >= 0; i--) {
		const msg = messages[i];
		if (msg.role !== 'user') { continue; }
		const content = typeof msg.content === 'string'
			? msg.content
			: Array.isArray(msg.content)
				? msg.content.map((/** @type {any} */ c) => c.text || '').join('')
				: '';
		if (/\[scenario:[^\]]+\]/.test(content)) {
			scenarioMsgIdx = i;
			break;
		}
	}

	// Count assistant messages after the scenario tag message
	let turns = 0;
	const startIdx = scenarioMsgIdx >= 0 ? scenarioMsgIdx + 1 : 0;
	for (let i = startIdx; i < messages.length; i++) {
		if (messages[i].role === 'assistant') {
			turns++;
		}
	}
	return turns;
}

/**
 * Compute the model-turn index for the current request given the scenario's
 * turn list. User turns are skipped (they're handled by the test harness)
 * and do not consume a model turn index.
 *
 * The algorithm counts completed assistant messages in the conversation
 * history (each one = one served model turn), then maps that to the
 * n-th model turn in the scenario (skipping user turns).
 *
 * @param {ScenarioTurn[]} turns
 * @param {any[]} messages
 * @returns {{ turn: ModelScenarioTurn, turnIndex: number }}
 */
function resolveCurrentTurn(turns, messages) {
	const completedModelTurns = countCompletedModelTurns(messages);
	// Build the model-only turn list (skip user turns) and clamp to the last
	// model turn so extra requests re-serve the final turn.
	const modelTurns = /** @type {ModelScenarioTurn[]} */ (turns.filter(t => t.kind !== 'user'));
	const idx = Math.min(completedModelTurns, modelTurns.length - 1);
	return { turn: modelTurns[idx], turnIndex: idx };
}
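
// Worked example: for turns [tool-calls, user, content], the model-only list
// is [tool-calls, content]. With no assistant message after the scenario tag
// the tool-calls turn is served; with one, the content turn; any further
// requests re-serve that final turn because of the Math.min clamp above.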

/**
 * @param {string} body
 * @param {http.ServerResponse} res
 */
async function handleChatCompletions(body, res) {
	let scenarioId = DEFAULT_SCENARIO;
	let isScenarioRequest = false;
	/** @type {string[]} */
	let requestToolNames = [];
	/** @type {any[]} */
	let messages = [];
	try {
		const parsed = JSON.parse(body);
		messages = parsed.messages || [];
		// Log user messages for debugging
		const userMsgs = messages.filter((/** @type {any} */ m) => m.role === 'user');
		if (userMsgs.length > 0) {
			const lastContent = typeof userMsgs[userMsgs.length - 1].content === 'string'
				? userMsgs[userMsgs.length - 1].content.substring(0, 100)
				: '(structured)';
			const ts = new Date().toISOString().slice(11, -1);
			console.log(`[mock-llm] ${ts} → ${messages.length} msgs, last user: "${lastContent}"`);
		}
		// Extract available tool names from the request's tools array
		const tools = parsed.tools || [];
		requestToolNames = tools.map((/** @type {any} */ t) => t.function?.name).filter(Boolean);
		if (requestToolNames.length > 0) {
			const ts = new Date().toISOString().slice(11, -1);
			console.log(`[mock-llm] ${ts} → ${requestToolNames.length} tools available: ${requestToolNames.join(', ')}`);
		}

		// Search user messages in reverse order (newest first) for the scenario
		// tag. This ensures the most recent message's tag takes precedence when
		// multiple messages with different tags exist in the same conversation
		// (e.g. in the leak checker which sends many scenarios in one session).
		// Follow-up user messages in multi-turn scenarios won't have a tag, so
		// searching backwards still finds the correct tag from the initial message.
		for (let mi = messages.length - 1; mi >= 0; mi--) {
			const msg = messages[mi];
			if (msg.role !== 'user') { continue; }
			const content = typeof msg.content === 'string'
				? msg.content
				: Array.isArray(msg.content)
					? msg.content.map((/** @type {any} */ c) => c.text || '').join('')
					: '';
			const match = content.match(/\[scenario:([^\]]+)\]/);
			if (match && SCENARIOS[match[1]]) {
				scenarioId = match[1];
				isScenarioRequest = true;
				break;
			}
		}
	} catch { }

	const scenario = SCENARIOS[scenarioId] || SCENARIOS[DEFAULT_SCENARIO];

	res.writeHead(200, {
		'Content-Type': 'text/event-stream',
		'Cache-Control': 'no-cache',
		'Connection': 'keep-alive',
		'X-Request-Id': 'perf-benchmark-' + Date.now(),
	});

	// Handle multi-turn scenarios — only when the request actually has tools.
	// Ancillary requests (title generation, progress messages) also contain the
	// [scenario:...] tag but don't send tools, so they fall through to content.
	if (isMultiTurnScenario(scenario) && requestToolNames.length > 0) {
		const { turn, turnIndex } = resolveCurrentTurn(scenario.turns, messages);
		const modelTurnCount = scenario.turns.filter(t => t.kind !== 'user').length;

		const ts = new Date().toISOString().slice(11, -1);
		console.log(`[mock-llm] ${ts} → multi-turn scenario ${scenarioId}, model turn ${turnIndex + 1}/${modelTurnCount} (${turn.kind}), ${countCompletedModelTurns(messages)} completed turns in history`);

		if (turn.kind === 'tool-calls') {
			await streamToolCalls(res, turn.toolCalls, requestToolNames, scenarioId);
			return;
		}

		if (turn.kind === 'thinking') {
			await streamThinkingThenContent(res, turn.thinkingChunks, turn.chunks, isScenarioRequest);
			return;
		}

		// kind === 'content' — stream the final text response
		await streamContent(res, turn.chunks, isScenarioRequest);
		return;
	}

	// Standard content-only scenario (or multi-turn scenario falling back for
	// ancillary requests like title generation that don't include tools)
	const chunks = isMultiTurnScenario(scenario)
		? getFirstContentTurn(scenario)
		: /** @type {StreamChunk[]} */ (scenario);

	await streamContent(res, chunks, isScenarioRequest);
}

/**
 * Get the chunks from the first content turn of a multi-turn scenario,
 * used as fallback text for ancillary requests (title generation etc).
 * @param {MultiTurnScenario} scenario
 * @returns {StreamChunk[]}
 */
function getFirstContentTurn(scenario) {
	/** @type {ContentScenarioTurn | undefined} */
	let contentTurn;
	for (const turn of scenario.turns) {
		if (turn.kind === 'content' || turn.kind === 'thinking') {
			contentTurn = turn;
			break;
		}
	}
	return contentTurn?.chunks ?? getDefaultScenarioChunks();
}

/**
 * Stream content chunks as a standard SSE response.
 * @param {http.ServerResponse} res
 * @param {StreamChunk[]} chunks
 * @param {boolean} isScenarioRequest
 */
async function streamContent(res, chunks, isScenarioRequest) {
	res.write(`data: ${JSON.stringify(makeInitialChunk())}\n\n`);

	for (const chunk of chunks) {
		if (chunk.delayMs > 0) { await sleep(chunk.delayMs); }
		res.write(`data: ${JSON.stringify(makeChunk(chunk.content, 0, false))}\n\n`);
	}

	res.write(`data: ${JSON.stringify(makeChunk('', 0, true))}\n\n`);
	res.write('data: [DONE]\n\n');
	res.end();

	if (isScenarioRequest) {
		serverEvents.emit('scenarioCompletion');
	}
}
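
// The full event sequence for a two-chunk content response is therefore:
//
//   data: {...,"delta":{"role":"assistant","content":""},...}    (initial chunk)
//   data: {...,"delta":{"content":"Hello"},...}
//   data: {...,"delta":{" world"},...}
//   data: {...,"delta":{},"finish_reason":"stop",...}
//   data: [DONE]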

/**
 * Stream thinking chunks followed by content chunks as an SSE response.
 * Thinking is emitted as `cot_summary` deltas, then a `cot_id` to close the
 * thinking block, followed by standard content deltas.
 * @param {http.ServerResponse} res
 * @param {StreamChunk[]} thinkingChunks
 * @param {StreamChunk[]} contentChunks
 * @param {boolean} isScenarioRequest
 */
async function streamThinkingThenContent(res, thinkingChunks, contentChunks, isScenarioRequest) {
	res.write(`data: ${JSON.stringify(makeInitialChunk())}\n\n`);

	// Stream thinking text
	for (const chunk of thinkingChunks) {
		if (chunk.delayMs > 0) { await sleep(chunk.delayMs); }
		res.write(`data: ${JSON.stringify(makeThinkingChunk(chunk.content))}\n\n`);
	}

	// Close thinking block with ID
	const cotId = `cot_perf_${Date.now()}`;
	res.write(`data: ${JSON.stringify(makeThinkingIdChunk(cotId))}\n\n`);
	await sleep(10);

	// Stream content
	for (const chunk of contentChunks) {
		if (chunk.delayMs > 0) { await sleep(chunk.delayMs); }
		res.write(`data: ${JSON.stringify(makeChunk(chunk.content, 0, false))}\n\n`);
	}

	res.write(`data: ${JSON.stringify(makeChunk('', 0, true))}\n\n`);
	res.write('data: [DONE]\n\n');
	res.end();

	if (isScenarioRequest) {
		serverEvents.emit('scenarioCompletion');
	}
}

/**
 * Stream tool call chunks as an SSE response.
 * @param {http.ServerResponse} res
 * @param {Array<{ toolNamePattern: RegExp, arguments: Record<string, any> }>} toolCalls
 * @param {string[]} requestToolNames
 * @param {string} scenarioId
 */
async function streamToolCalls(res, toolCalls, requestToolNames, scenarioId) {
	res.write(`data: ${JSON.stringify(makeToolCallInitialChunk())}\n\n`);

	for (let i = 0; i < toolCalls.length; i++) {
		const call = toolCalls[i];
		const callId = `call_perf_${scenarioId}_${i}_${Date.now()}`;

		// Find the matching tool name from the request's tools array
		let toolName = requestToolNames.find(name => call.toolNamePattern.test(name));
		if (!toolName) {
			toolName = call.toolNamePattern.source.replace(/[\\.|?*+^${}()\[\]]/g, '');
			console.warn(`[mock-llm] No matching tool for pattern ${call.toolNamePattern}, using fallback: ${toolName}`);
		}

		// Stream tool call: start chunk, then arguments in fragments
		res.write(`data: ${JSON.stringify(makeToolCallStartChunk(i, callId, toolName))}\n\n`);
		await sleep(10);

		const argsJson = JSON.stringify(call.arguments);
		const fragmentSize = Math.max(20, Math.ceil(argsJson.length / 4));
		for (let pos = 0; pos < argsJson.length; pos += fragmentSize) {
			const fragment = argsJson.slice(pos, pos + fragmentSize);
			res.write(`data: ${JSON.stringify(makeToolCallArgsChunk(i, fragment))}\n\n`);
			await sleep(5);
		}
	}

	res.write(`data: ${JSON.stringify(makeToolCallFinishChunk())}\n\n`);
	res.write('data: [DONE]\n\n');
	res.end();
}
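
// Example (illustrative): hypothetical arguments {"filePath":"/tmp/a.ts","startLine":1}
// serialize to 38 characters of JSON, so fragmentSize = max(20, ceil(38 / 4)) = 20
// and the arguments arrive in two fragments (20 + 18 chars), ~5ms apart.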

/**
 * Start the mock server and return a handle.
 * @param {number} port
 */
function startServer(port = 0) {
	return new Promise((resolve, reject) => {
		let reqCount = 0;
		let completions = 0;
		/** @type {Array<() => boolean>} */
		let requestWaiters = [];
		/** @type {Array<() => boolean>} */
		let completionWaiters = [];

		const onCompletion = () => {
			completions++;
			// Each waiter returns true once satisfied; drop those from the list.
			completionWaiters = completionWaiters.filter(fn => !fn());
		};
		serverEvents.on('scenarioCompletion', onCompletion);

		const server = http.createServer((req, res) => {
			reqCount++;
			requestWaiters = requestWaiters.filter(fn => !fn());
			handleRequest(req, res);
		});
		server.listen(port, '127.0.0.1', () => {
			const addr = server.address();
			const actualPort = typeof addr === 'object' && addr ? addr.port : port;
			const url = `http://127.0.0.1:${actualPort}`;
			resolve({
				port: actualPort,
				url,
				close: () => /** @type {Promise<void>} */(new Promise((resolve, reject) => {
					serverEvents.removeListener('scenarioCompletion', onCompletion);
					server.close(err => err ? reject(err) : resolve(undefined));
				})),
				/** Return total request count. */
				requestCount: () => reqCount,
				/**
				 * Wait until at least `n` requests have been received.
				 * @param {number} n
				 * @param {number} timeoutMs
				 * @returns {Promise<void>}
				 */
				waitForRequests: (n, timeoutMs) => new Promise((resolve, reject) => {
					if (reqCount >= n) { resolve(); return; }
					const timer = setTimeout(() => reject(new Error(`Timed out waiting for ${n} requests (got ${reqCount})`)), timeoutMs);
					requestWaiters.push(() => {
						if (reqCount >= n) { clearTimeout(timer); resolve(); return true; }
						return false;
					});
				}),
				/** Return total scenario-completion count. */
				completionCount: () => completions,
				/**
				 * Wait until at least `n` scenario chat completions have been served.
				 * @param {number} n
				 * @param {number} timeoutMs
				 * @returns {Promise<void>}
				 */
				waitForCompletion: (n, timeoutMs) => new Promise((resolve, reject) => {
					if (completions >= n) { resolve(); return; }
					const timer = setTimeout(() => reject(new Error(`Timed out waiting for ${n} completions (got ${completions})`)), timeoutMs);
					completionWaiters.push(() => {
						if (completions >= n) { clearTimeout(timer); resolve(); return true; }
						return false;
					});
				}),
			});
		});
		server.on('error', reject);
	});
}
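
// Example (illustrative) of driving the server handle from a test:
//
//   const handle = await startServer();        // ephemeral port on 127.0.0.1
//   // ... point the product under test at handle.url and run one scenario ...
//   await handle.waitForCompletion(1, 30000);  // block until the reply is fully served
//   console.log('requests seen:', handle.requestCount());
//   await handle.close();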

// Allow running standalone for testing:
// node scripts/chat-simulation/common/mock-llm-server.js [port]
if (require.main === module) {
	const { registerPerfScenarios } = require('./perf-scenarios');
	registerPerfScenarios();
	const port = parseInt(process.argv[2] || '0', 10);
	startServer(port).then((/** @type {any} */ handle) => {
		console.log(`Mock LLM server listening at ${handle.url}`);
		console.log('Scenarios:', Object.keys(SCENARIOS).join(', '));
	});
}

/**
 * Get the user follow-up messages for a scenario, in order.
 * Returns an array of { message, afterModelTurn } objects where afterModelTurn
 * is the number of model turns that must have completed before this user
 * message is injected.
 * @param {string} scenarioId
 * @returns {Array<{ message: string, afterModelTurn: number }>}
 */
function getUserTurns(scenarioId) {
	const scenario = SCENARIOS[scenarioId];
	if (!isMultiTurnScenario(scenario)) { return []; }
	const result = [];
	let modelTurnsSeen = 0;
	for (const turn of scenario.turns) {
		if (turn.kind === 'user') {
			result.push({ message: turn.message, afterModelTurn: modelTurnsSeen });
		} else {
			modelTurnsSeen++;
		}
	}
	return result;
}
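
// Example (illustrative): for turns [tool-calls, user('continue'), content],
// getUserTurns returns [{ message: 'continue', afterModelTurn: 1 }]: the
// harness injects 'continue' once one model turn has completed.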

/**
 * Get the total number of model turns (non-user turns) in a scenario.
 * Content-only scenarios count as a single model turn.
 * @param {string} scenarioId
 * @returns {number}
 */
function getModelTurnCount(scenarioId) {
	const scenario = SCENARIOS[scenarioId];
	if (!isMultiTurnScenario(scenario)) { return 1; }
	return scenario.turns.filter(t => t.kind !== 'user').length;
}

/**
 * Register a scenario dynamically. Test files call this to add
 * scenarios that are only relevant to them.
 * @param {string} id - unique scenario identifier
 * @param {StreamChunk[] | MultiTurnScenario} definition - scenario data
 */
function registerScenario(id, definition) {
	SCENARIOS[id] = definition;
}
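
// Example (illustrative): registering a hypothetical two-turn scenario.
//
//   registerScenario('edit-then-summarize', {
//       type: 'multi-turn',
//       turns: [
//           { kind: 'tool-calls', toolCalls: [{ toolNamePattern: /edit/i, arguments: { query: 'x' } }] },
//           { kind: 'content', chunks: new ScenarioBuilder().stream(['Done', '.']).build() },
//       ],
//   });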

/**
 * Return the IDs of all currently registered scenarios.
 * @returns {string[]}
 */
function getScenarioIds() {
	return Object.keys(SCENARIOS);
}

module.exports = { startServer, SCENARIOS, ScenarioBuilder, registerScenario, getScenarioIds, getUserTurns, getModelTurnCount };