CoCalc -- promptCategorizationTaxonomy.ts

GitHub Repository: microsoft/vscode
Path: blob/main/extensions/copilot/src/extension/prompt/common/promptCategorizationTaxonomy.ts
13399 views
1
/*---------------------------------------------------------------------------------------------
2
 *  Copyright (c) Microsoft Corporation. All rights reserved.
3
 *  Licensed under the MIT License. See License.txt in the project root for license information.
4
 *--------------------------------------------------------------------------------------------*/
5

6
/**
7
 * Domain + Intent + Scope + Time Estimate classification taxonomy.
8
 *
9
 * Single source of truth for the domain, intent, and scope categories (derived from
10
 * clustering analysis) and time estimate dimensions.
11
 */
12

13
// ============================================================================
14
// INTENTS - What action the user wants
15
// ============================================================================
16

17
/**
 * Intent taxonomy: the action the user wants the assistant to perform.
 *
 * Each key is an intent identifier; each value carries a human-readable
 * description plus illustrative keywords. The object is declared `as const`
 * so the keys stay literal (they feed the `PromptIntent` union) and is checked
 * with `satisfies` so every entry conforms to `CategoryDefinition` without
 * widening. `need_info` and `other` are the fallback buckets and intentionally
 * have no keywords.
 */
export const INTENT_DEFINITIONS = {
	explain: {
		description: 'Prompts asking the assistant to explain code, concepts, or technical topics. Includes requests for clarification, summaries, definitions, and step-by-step walkthroughs of implementations or workflows.',
		keywords: ['explanation', 'understanding', 'clarification', 'how-it-works', 'summary', 'definitions', 'step-by-step', 'guidance'],
	},
	find_content: {
		description: 'Prompts requesting the assistant to retrieve, read, or locate files, code references, definitions, and usage patterns within a codebase or project repository.',
		keywords: ['retrieve', 'read', 'file contents', 'search', 'references', 'codebase', 'locate', 'fetch'],
	},
	research: {
		description: 'Prompts requesting the assistant to research and investigate implementation details, usage patterns, and documentation of existing code or systems.',
		keywords: ['research', 'implementation details', 'documentation', 'usage patterns', 'investigation'],
	},
	review: {
		description: 'Prompts requesting code review, validation of implementations against requirements, analysis of code changes and quality, and identification of issues, vulnerabilities, and improvements. Covers both formal review feedback and structural/usage pattern analysis.',
		keywords: ['code review', 'validation', 'compliance', 'correctness', 'code quality', 'vulnerability analysis', 'code changes', 'feedback'],
	},
	generate_docs: {
		description: 'Prompts requesting the assistant to generate documentation, summary reports, and example or sample code.',
		keywords: ['documentation', 'generate', 'summary reports', 'example code', 'technical writing'],
	},
	troubleshoot_debug: {
		description: 'Prompts requesting help diagnosing and resolving failures, errors, bugs, and incidents. Includes troubleshooting build/code errors, root cause analysis, and investigation of test failures and operational incidents.',
		keywords: ['troubleshoot', 'debug', 'failure', 'error', 'root cause', 'fix', 'build errors', 'incidents', 'bugs'],
	},
	git_ops: {
		description: 'Prompts requesting help with Git branch operations including creating, switching, merging, rebasing branches, and resolving merge conflicts.',
		keywords: ['branch', 'merge', 'rebase', 'conflicts', 'commit', 'Git operations'],
	},
	run_code: {
		description: 'Prompts requesting the assistant to run, execute, or initiate code, scripts, commands, builds, or other defined processes.',
		keywords: ['execute', 'run', 'build', 'script', 'process', 'commands'],
	},
	config_mgmt: {
		description: 'Prompts requesting changes to application configuration, features, user interface design, or documentation, typically involving updates or modifications to existing settings and appearance.',
		keywords: ['configuration', 'feature updates', 'UI modification', 'settings', 'design changes'],
	},
	new_feature: {
		description: 'Prompts requesting the assistant to build a new user-facing feature or capability requiring coordinated code changes, typically spanning multiple files or components.',
		keywords: ['build', 'implement', 'add feature', 'create feature', 'set up', 'integrate', 'new capability'],
	},
	refactor: {
		description: 'Prompts requesting the assistant to restructure, reorganize, or improve existing code without changing its external behavior. Includes extracting functions, renaming, simplifying logic, and improving code organization.',
		keywords: ['refactor', 'restructure', 'reorganize', 'clean up', 'extract', 'simplify', 'rename', 'improve structure'],
	},
	data_analysis_viz: {
		description: 'Prompts requesting the assistant to analyze data, create visualizations, build charts or graphs, run queries, or explore datasets for insights and reporting.',
		keywords: ['data analysis', 'visualization', 'charts', 'graphs', 'querying', 'reporting', 'dashboards', 'data exploration'],
	},
	need_info: {
		description: 'Not enough information to determine the intent. The prompt may be too short, too vague, or lack sufficient context to make a determination.',
		keywords: [],
	},
	other: {
		description: 'Prompts whose intent does not fit into any of the defined categories. These may involve niche actions or mixed intents outside the taxonomy.',
		keywords: [],
	},
} as const satisfies Record<string, CategoryDefinition>;
75

76
// ============================================================================
77
// DOMAINS - What area of code/system (orthogonal to intents)
78
// ============================================================================
79

80
/**
 * Domain taxonomy: the technical area a prompt is about.
 *
 * Each key is a domain identifier; each value carries a human-readable
 * description plus illustrative keywords. Declared `as const` so the keys stay
 * literal (they feed the `PromptDomain` union) and checked with `satisfies`
 * against `CategoryDefinition`. `need_info` and `other` are the fallback
 * buckets and intentionally have no keywords.
 */
export const DOMAIN_DEFINITIONS = {
	cicd_cloud_infra: {
		description: 'Prompts involving continuous integration/deployment pipeline configuration, cloud infrastructure provisioning and automation, container orchestration, and infrastructure-as-code workflows.',
		keywords: ['CI/CD', 'build automation', 'deployment pipelines', 'cloud infrastructure', 'provisioning', 'IaC', 'containerization', 'configuration management', 'DevOps'],
	},
	cli_scripting: {
		description: 'Prompts focused on building, customizing, and automating command-line interface tools, shell scripts, and terminal workflows for developer productivity.',
		keywords: ['CLI', 'command-line', 'shell scripting', 'bash', 'PowerShell', 'terminal', 'task automation'],
	},
	automated_testing: {
		description: 'Prompts focused on automated software testing tools, frameworks, and suites spanning unit, integration, and end-to-end testing, including test coverage and workflow analysis.',
		keywords: ['automated testing', 'unit testing', 'integration testing', 'end-to-end testing', 'test frameworks', 'test suites', 'test coverage'],
	},
	ai_agent: {
		description: 'Prompts focused on designing, configuring, and orchestrating AI agents and coding assistants, including their workflows, integration architectures, and framework capabilities.',
		keywords: ['AI agents', 'orchestration', 'workflow automation', 'integration architecture', 'coding assistants', 'LLM integration', 'MCP'],
	},
	network_infra: {
		description: 'Prompts focused on configuring, deploying, and managing network infrastructure, including remote access, multi-server environments, and network security.',
		keywords: ['network configuration', 'server management', 'remote access', 'firewall', 'DNS', 'VPN', 'load balancing', 'routing', 'connectivity'],
	},
	project_mgmt: {
		description: 'Prompts related to project management, issue tracking, and task management within development workflows.',
		keywords: ['issue tracking', 'project management', 'task management', 'workflow management', 'project planning'],
	},
	data_pipelines: {
		description: 'Prompts focused on building, configuring, and orchestrating data processing pipelines that handle ingestion, transformation, and formatting of structured data across various file formats and scales.',
		keywords: ['data pipelines', 'ETL workflows', 'data transformation', 'file processing', 'pipeline orchestration', 'ingestion'],
	},
	web_ui: {
		description: 'Prompts focused on designing, building, and architecting user interface components and layouts for web application frontends.',
		keywords: ['UI', 'web application', 'user interface', 'frontend', 'components', 'layout', 'styling', 'responsive design'],
	},
	backend_dev: {
		description: 'Prompts focused on building, designing, and maintaining server-side applications, APIs, business logic, authentication, and service architectures.',
		keywords: ['API', 'server', 'endpoint', 'REST', 'GraphQL', 'backend', 'microservices', 'authentication', 'business logic'],
	},
	game_dev: {
		description: 'Prompts focused on designing, building, and testing the architecture, mechanics, and subsystems of digital and tabletop games.',
		keywords: ['game development', 'game engine', 'game mechanics', 'rendering', 'multiplayer', 'interactive gameplay', 'asset creation'],
	},
	package_mgmt: {
		description: 'Prompts focused on managing software dependencies, package installations, version control of libraries, and release workflows across programming languages and platforms.',
		keywords: ['dependency management', 'package managers', 'version management', 'software releases', 'dependency resolution'],
	},
	version_control: {
		description: 'Prompts related to managing source code repositories, version control systems, branching and merging strategies, and collaborative development workflows.',
		keywords: ['source code', 'repository', 'version control', 'Git', 'branching', 'merging', 'code management'],
	},
	incident_mgmt: {
		description: 'Prompts focused on building, integrating, and querying incident management systems for tracking, triaging, investigating, and resolving operational and security incidents.',
		keywords: ['incident management', 'security incidents', 'ticketing systems', 'workflow automation', 'incident response', 'triage'],
	},
	logging_observability: {
		description: 'Prompts focused on designing, configuring, querying, and analyzing application and system logs, including logging frameworks, log aggregation, monitoring dashboards, and observability infrastructure.',
		keywords: ['logging', 'log analysis', 'monitoring', 'observability', 'metrics', 'alerting', 'tracing', 'dashboards'],
	},
	database_mgmt: {
		description: 'Prompts focused on designing, analyzing, managing, and querying relational database schemas, including data modeling for business intelligence and data warehouse contexts.',
		keywords: ['database schema', 'relational database', 'data modeling', 'query design', 'schema management', 'SQL'],
	},
	ml_statistics: {
		description: 'Prompts focused on machine learning model development, training, evaluation, and deployment, as well as statistical analysis, data science workflows, and mathematical modeling.',
		keywords: ['machine learning', 'deep learning', 'neural networks', 'model training', 'statistics', 'regression', 'classification', 'data science', 'feature engineering', 'model evaluation'],
	},
	need_info: {
		description: 'Not enough information to determine the domain. The prompt may be too short, too vague, or lack sufficient context to make a determination.',
		keywords: [],
	},
	other: {
		description: 'Prompts that do not fit into any of the defined domain categories. These may involve niche or specialized topics outside the taxonomy.',
		keywords: [],
	},
} as const satisfies Record<string, CategoryDefinition>;
154

155
// ============================================================================
156
// SCOPES - What code context is needed
157
// ============================================================================
158

159
/**
 * Scope taxonomy: how much code context (or which external system) a prompt
 * needs in order to be fulfilled.
 *
 * Unlike intents and domains, scope entries carry `signals` (cues that point
 * to the scope) rather than `keywords`. Declared `as const` so the keys stay
 * literal (they feed the `PromptScope` union) and checked with `satisfies`
 * against `CategoryDefinition`. `unknown_scope` is the fallback and has no
 * signals.
 */
export const SCOPE_DEFINITIONS = {
	// File-level scopes
	selection: {
		description: 'Operates on user\'s currently selected/highlighted code',
		signals: ['user has active selection', 'uses "this"'],
	},
	current_file: {
		description: 'Entire file user is currently viewing/editing',
		signals: ['"this file"', 'mentions filename', 'file-level operation'],
	},
	few_files: {
		description: 'Small set of related files (2-5 files)',
		signals: ['"this component and its tests"', 'specific file mentions'],
	},
	many_files: {
		description: 'Large set of files or entire module/package',
		signals: ['"all components"', '"entire module"', '"across files"'],
	},

	// Repository scopes
	codebase: {
		description: 'Entire project/codebase understanding required',
		signals: ['"project"', '"codebase"', '"application"', '"system"', 'architecture-level'],
	},
	multi_repository: {
		description: 'Operates across multiple repositories (microservices, monorepo packages)',
		signals: ['"other repo"', '"microservice"', '"shared library"', 'cross-repo dependency', 'multi-package'],
	},

	// External scopes
	scm_operations: {
		description: 'Git operations, branch management, PR creation',
		signals: ['git commands', 'branch', 'PR', 'merge', 'rebase', 'git history', 'cherry-pick', 'git push', 'git pull', 'git fetch', 'git commit', 'git diff', 'git stash'],
	},
	issue_tracker: {
		description: 'Operates on issue tracking systems (GitHub Issues, JIRA, Linear)',
		signals: ['issue', 'bug', 'ticket', 'backlog', 'sprint', 'tracking system'],
	},
	remote_service: {
		description: 'Interacts with external services, APIs, cloud resources, or remote databases',
		signals: ['external API', 'cloud service', 'SaaS', 'third-party', 'webhook', 'staging database', 'production database', 'remote connection', 'SSH'],
	},
	external: {
		description: 'Requires knowledge outside the codebase (docs, web, general knowledge)',
		signals: ['questions about languages', 'frameworks', 'best practices', '"how to" (general)'],
	},

	// Transient
	ephemeral: {
		description: 'One-off task, doesn\'t directly modify main codebase',
		signals: ['"write a script to"', '"analyze this data"', 'temporary/throwaway work'],
	},
	unknown_scope: {
		description: 'Scope cannot be determined from message',
		signals: [],
	},
} as const satisfies Record<string, CategoryDefinition>;
216

217
// ============================================================================
218
// Shared types and utilities
219
// ============================================================================
220

221
/**
 * Shape shared by every entry of the intent, domain, and scope taxonomies.
 *
 * Only `description` is required; the remaining fields are optional and are
 * appended to the generated prompt text only when present and non-empty.
 */
interface CategoryDefinition {
	/** Human-readable explanation of what belongs in the category. */
	description: string;
	/** Illustrative terms associated with the category. */
	keywords?: readonly string[];
	/** Sample prompts; rendered wrapped in double quotes. */
	examples?: readonly string[];
	/** Cues that point to the category (used by the scope taxonomy). */
	signals?: readonly string[];
	/** Free-form text appended verbatim after the other fields. */
	notes?: string;
}
228

229
/** Union of every intent identifier, i.e. the keys of `INTENT_DEFINITIONS`. */
export type PromptIntent = keyof typeof INTENT_DEFINITIONS;

/** Union of every domain identifier, i.e. the keys of `DOMAIN_DEFINITIONS`. */
export type PromptDomain = keyof typeof DOMAIN_DEFINITIONS;

/** Union of every scope identifier, i.e. the keys of `SCOPE_DEFINITIONS`. */
export type PromptScope = keyof typeof SCOPE_DEFINITIONS;
233

234
/**
 * Validation sets, one per taxonomy, built from the keys of the corresponding
 * definitions object so they can never drift from the definitions.
 */
export const VALID_INTENTS: ReadonlySet<PromptIntent> = new Set(
	Object.keys(INTENT_DEFINITIONS) as PromptIntent[],
);
export const VALID_DOMAINS: ReadonlySet<PromptDomain> = new Set(
	Object.keys(DOMAIN_DEFINITIONS) as PromptDomain[],
);
export const VALID_SCOPES: ReadonlySet<PromptScope> = new Set(
	Object.keys(SCOPE_DEFINITIONS) as PromptScope[],
);
238

239
/**
 * Type guard: reports whether `value` is one of the intent identifiers held in
 * `VALID_INTENTS`, narrowing it to `PromptIntent` on success.
 */
export function isValidIntent(value: string): value is PromptIntent {
	// View the set through a string-keyed type so the lookup needs no cast.
	const knownIntents: ReadonlySet<string> = VALID_INTENTS;
	return knownIntents.has(value);
}
243
/**
 * Type guard: reports whether `value` is one of the domain identifiers held in
 * `VALID_DOMAINS`, narrowing it to `PromptDomain` on success.
 */
export function isValidDomain(value: string): value is PromptDomain {
	// View the set through a string-keyed type so the lookup needs no cast.
	const knownDomains: ReadonlySet<string> = VALID_DOMAINS;
	return knownDomains.has(value);
}
246
/**
 * Type guard: reports whether `value` is one of the scope identifiers held in
 * `VALID_SCOPES`, narrowing it to `PromptScope` on success.
 */
export function isValidScope(value: string): value is PromptScope {
	// View the set through a string-keyed type so the lookup needs no cast.
	const knownScopes: ReadonlySet<string> = VALID_SCOPES;
	return knownScopes.has(value);
}
249

250
/**
 * The classification result structure.
 *
 * Its fields mirror the properties listed as required in
 * `CATEGORIZE_PROMPT_TOOL_SCHEMA`.
 */
export interface PromptClassification {
	/** The action the user wants (a key of `INTENT_DEFINITIONS`). */
	intent: PromptIntent;
	/** The technical area the prompt is about (a key of `DOMAIN_DEFINITIONS`). */
	domain: PromptDomain;
	/** Effort estimates expressed as ISO 8601 durations. */
	timeEstimate: {
		/** ISO 8601 duration for best case scenario, e.g., "PT5M" for 5 minutes */
		bestCase: string;
		/** ISO 8601 duration for realistic scenario, e.g., "PT15M" for 15 minutes */
		realistic: string;
	};
	/** The code context required (a key of `SCOPE_DEFINITIONS`). */
	scope: PromptScope;
	/** Confidence score between 0.0 and 1.0 */
	confidence: number;
	/** Brief reasoning for the classification */
	reasoning: string;
}
268

269
// ============================================================================
270
// Prompt generation helpers
271
// ============================================================================
272

273
/**
 * Renders one taxonomy entry as a markdown fragment for the prompt.
 *
 * The fragment always starts with a level-3 heading containing the key in
 * backticks, followed by the description. Keywords, signals, examples, and
 * notes follow in that order, each on its own line and only when present and
 * non-empty.
 *
 * @param key Category identifier shown in the heading.
 * @param def Definition whose fields are rendered.
 * @returns The rendered lines joined with single newlines.
 */
function formatCategoryForPrompt(key: string, def: CategoryDefinition): string {
	const { description, keywords, signals, examples, notes } = def;

	// `undefined` marks an optional line that should be left out.
	const candidateLines: Array<string | undefined> = [
		'### `' + key + '`',
		description,
		keywords?.length ? '- Keywords: ' + keywords.join(', ') : undefined,
		signals?.length ? '- Signals: ' + signals.join(', ') : undefined,
		examples?.length
			? 'Examples: ' + examples.map(example => '"' + example + '"').join(', ')
			: undefined,
		notes ? notes : undefined,
	];

	return candidateLines
		.filter((line): line is string => line !== undefined)
		.join('\n');
}
291

292
/**
 * Generate prompt section for intents.
 *
 * Emits the `## Intent Categories` heading followed by every entry of
 * `INTENT_DEFINITIONS`, each rendered by `formatCategoryForPrompt` and
 * separated from the next by a blank line.
 *
 * @returns The markdown text of the intent section.
 */
export function generateIntentPromptSection(): string {
	const header = '## Intent Categories\n';
	const categories = Object.entries(INTENT_DEFINITIONS)
		.map(([key, def]) => formatCategoryForPrompt(key, def))
		.join('\n\n');
	return header + categories;
}
300

301
/**
 * Generate prompt section for domains.
 *
 * Emits the `## Domain Categories` heading followed by every entry of
 * `DOMAIN_DEFINITIONS`, each rendered by `formatCategoryForPrompt` and
 * separated from the next by a blank line.
 *
 * @returns The markdown text of the domain section.
 */
export function generateDomainPromptSection(): string {
	const header = '## Domain Categories\n';
	const categories = Object.entries(DOMAIN_DEFINITIONS)
		.map(([key, def]) => formatCategoryForPrompt(key, def))
		.join('\n\n');
	return header + categories;
}
309

310
/**
 * Generate prompt section for scopes.
 *
 * Emits the `# SCOPE` heading followed by every entry of `SCOPE_DEFINITIONS`,
 * each rendered by `formatCategoryForPrompt` and separated from the next by a
 * blank line.
 *
 * @returns The markdown text of the scope section.
 */
export function generateScopePromptSection(): string {
	const header = '# SCOPE - What code context is needed (choose ONE)\n';
	const categories = Object.entries(SCOPE_DEFINITIONS)
		.map(([key, def]) => formatCategoryForPrompt(key, def))
		.join('\n\n');
	return header + categories;
}
318

319
/**
 * Classification guidance for the LLM.
 *
 * Static markdown that explains how domain and intent differ, tells the model
 * to classify the two independently, and lists a short pre-classification
 * checklist. Everything inside the template literal is prompt text, so stray
 * characters here end up in front of the model verbatim.
 */
const CLASSIFICATION_GUIDANCE = `# CLASSIFICATION GUIDANCE

## Domain vs Intent — these are separate dimensions

Domain and intent are independent. Classify each on its own merits. Do NOT substitute one for the other.

**Domain** is the technical subject area or problem space the user is operating in.
- It describes a system, architecture, technology area, or problem space — never an activity.
- Think of it as answering: "What area of technology is this about?"
- If the prompt does not clearly indicate a technical domain, use \`need_info\`.

**Intent** is the developer action or goal being performed within that domain.
- It describes what the user is trying to accomplish — the verb, not the noun.
- Think of it as answering: "What is the user trying to do?"
- If the prompt does not clearly indicate an intent, use \`need_info\`.

**Key rule**: A prompt about CI/CD pipelines (domain) might be asking for an explanation (intent), troubleshooting (intent), or code review (intent). Classify each dimension independently. Never let the domain influence your intent classification or vice versa.

Focus on semantic meaning, not keyword matching. Keywords are illustrative, not exhaustive.

## Pre-classification check
1. **What technical area does this fall into?** Match to the most specific domain category.
2. **If multiple domains apply**, choose the primary one — the domain that best captures what the user is actually trying to accomplish.
3. **What is the user trying to do?** Match to the most specific intent category.
4. **If multiple intents apply**, choose the primary one — the intent that best captures the user's goal.`;
345

346
/**
 * Generate full taxonomy prompt.
 *
 * Concatenates, in order: the classification guidance, the domain section,
 * the intent section, the time-estimate instructions, and the scope section.
 * Every array element is joined with a blank line (`'\n\n'`), so each
 * time-estimate bullet ends up separated from the next by a blank line and
 * the empty-string element adds extra blank space before the scope section.
 *
 * @returns The complete taxonomy prompt text.
 */
export function generateTaxonomyPrompt(): string {
	return [
		CLASSIFICATION_GUIDANCE,
		generateDomainPromptSection(),
		generateIntentPromptSection(),
		'# TIME ESTIMATE',
		'Estimate how long an **experienced developer familiar with the codebase** would take:',
		'- Consider: understanding requirements, writing code, testing, debugging, code review',
		'- Format: ISO 8601 duration (e.g., "PT5M" for 5 minutes, "PT1H30M" for 1.5 hours)',
		'- Provide both "bestCase" (everything goes smoothly) and "realistic" (typical complications)',
		'',
		generateScopePromptSection(),
	].join('\n\n');
}
361

362
// ============================================================================
363
// Tool calling schema for structured output
364
// ============================================================================
365

366
/**
 * Name of the tool whose parameters are described by the categorization
 * schema below.
 */
export const CATEGORIZE_PROMPT_TOOL_NAME = 'categorize_prompt';
368

369
/**
 * JSON Schema for the categorize_prompt tool parameters.
 *
 * The `enum` lists for `intent`, `domain`, and `scope` are computed from the
 * keys of the corresponding definitions objects, so the schema follows the
 * taxonomy automatically. All six top-level properties are required, and
 * `additionalProperties: false` rejects anything else at both the top level
 * and inside `timeEstimate`.
 */
export const CATEGORIZE_PROMPT_TOOL_SCHEMA = {
	type: 'object',
	additionalProperties: false,
	properties: {
		intent: {
			type: 'string',
			enum: Object.keys(INTENT_DEFINITIONS),
			description: 'The primary action the user wants to perform'
		},
		domain: {
			type: 'string',
			enum: Object.keys(DOMAIN_DEFINITIONS),
			description: 'The area of code or system the request relates to'
		},
		scope: {
			type: 'string',
			enum: Object.keys(SCOPE_DEFINITIONS),
			description: 'The code context required to fulfill the request'
		},
		timeEstimate: {
			type: 'object',
			additionalProperties: false,
			properties: {
				bestCase: {
					type: 'string',
					description: 'ISO 8601 duration for best case scenario (e.g., "PT5M" for 5 minutes)'
				},
				realistic: {
					type: 'string',
					description: 'ISO 8601 duration for realistic scenario (e.g., "PT15M" for 15 minutes)'
				}
			},
			required: ['bestCase', 'realistic']
		},
		confidence: {
			type: 'number',
			minimum: 0,
			maximum: 1,
			description: 'Confidence score between 0.0 and 1.0'
		},
		reasoning: {
			type: 'string',
			description: 'Brief 1-2 sentence explanation for the classification'
		}
	},
	required: ['intent', 'domain', 'scope', 'timeEstimate', 'confidence', 'reasoning']
} as const;
417

418
Product

Resources

Company