Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
microsoft
GitHub Repository: microsoft/vscode
Path: blob/main/src/vs/editor/standalone/common/monarch/monarchCompile.ts
3296 views
1
/*---------------------------------------------------------------------------------------------
2
* Copyright (c) Microsoft Corporation. All rights reserved.
3
* Licensed under the MIT License. See License.txt in the project root for license information.
4
*--------------------------------------------------------------------------------------------*/
5
6
/*
 * This module only exports 'compile', which compiles a JSON language definition
 * into a typed and checked ILexer definition.
 */
10
11
import { isString } from '../../../../base/common/types.js';
12
import * as monarchCommon from './monarchCommon.js';
13
import { IMonarchLanguage, IMonarchLanguageBracket } from './monarchTypes.js';
14
15
/*
 * Type helpers
 *
 * Note: these are just sanity checks on the JSON description, which are
 * helpful for the programmer. No further checks are done once the lexer is
 * 'compiled and checked'.
 */
23
24
/**
 * Tells whether `obj` is an array whose elements are all accepted by `elemType`.
 *
 * @param elemType Predicate applied to every element of the array.
 * @param obj The value to inspect; anything that is falsy or not an array is rejected.
 * @returns `true` if `obj` is an array and no element is rejected (an empty array qualifies).
 */
function isArrayOf(elemType: (x: any) => boolean, obj: any): boolean {
	if (!obj || !Array.isArray(obj)) {
		return false;
	}
	// Index loop on purpose: holes in sparse arrays are visited as `undefined`.
	for (let index = 0; index < obj.length; index++) {
		if (!elemType(obj[index])) {
			return false;
		}
	}
	return true;
}
38
39
/**
 * Returns `prop` when it is a real boolean, otherwise the supplied default.
 *
 * @param prop The candidate value (typically read from the JSON definition).
 * @param defValue The value to use when `prop` is not of type boolean.
 */
function bool(prop: any, defValue: boolean): boolean {
	return typeof prop === 'boolean' ? prop : defValue;
}
45
46
/**
 * Returns `prop` when it is a string, otherwise the supplied default.
 *
 * @param prop The candidate value (typically read from the JSON definition).
 * @param defValue The value to use when `prop` is not of type string.
 */
function string(prop: any, defValue: string): string {
	return typeof prop === 'string' ? prop : defValue;
}
52
53
54
/**
 * Builds a membership table that maps every entry of `array` to `true`.
 *
 * The table is created without a prototype so that words which collide with
 * members of `Object.prototype` ('constructor', 'hasOwnProperty', '__proto__', ...)
 * are stored and looked up like any other word.
 *
 * @param array The words to register.
 * @returns An object whose own keys are exactly the given words.
 */
function arrayToHash(array: string[]): { [name: string]: true } {
	const result: { [name: string]: true } = Object.create(null);
	for (const e of array) {
		result[e] = true;
	}
	return result;
}

/**
 * Creates a predicate that tests whether a word is one of the given keywords.
 *
 * @param arr The list of keywords.
 * @param caseInsensitive When `true`, keywords and tested words are compared in lower case.
 * @returns A function returning `true` only for words contained in `arr`.
 */
function createKeywordMatcher(arr: string[], caseInsensitive: boolean = false): (str: string) => boolean {
	const words = caseInsensitive ? arr.map(function (x) { return x.toLowerCase(); }) : arr;
	const hash = arrayToHash(words);
	// The table has no prototype, so a plain lookup can only ever see registered words;
	// there is no need to (and no way to) call `hasOwnProperty` on it.
	if (caseInsensitive) {
		return function (word) {
			return hash[word.toLowerCase()] === true;
		};
	} else {
		return function (word) {
			return hash[word] === true;
		};
	}
}
78
79
80
// Lexer helpers
81
82
/**
 * Compiles a regular expression string, adding the 'i' flag if 'ignoreCase' is set, and the 'u' flag if 'unicode' is set.
 * Also replaces @\w+ sequences with the content of the specified lexer attribute.
 * The @\w+ replacement can be avoided by escaping `@` signs with another `@` sign.
 * When `handleSn` is set and the pattern contains `$Sn` references, a function is returned
 * that builds (and caches, for the most recent state) the regular expression for a given state.
 * @example /@attr/ will be replaced with the value of lexer[attr]
 * @example /@@text/ will not be replaced and will become /@text/.
 */
function compileRegExp<S extends true | false>(lexer: monarchCommon.ILexerMin, str: string, handleSn: S): S extends true ? RegExp | DynamicRegExp : RegExp;
function compileRegExp(lexer: monarchCommon.ILexerMin, str: string, handleSn: true | false): RegExp | DynamicRegExp {
	// @@ must be interpreted as a literal @, so all occurrences of @@ are swapped for a placeholder character first
	str = str.replace(/@@/g, `\x01`);

	// Expand attribute references; attributes may reference other attributes,
	// so keep expanding until a pass finds nothing, for at most five passes.
	let passes = 0;
	let expanded = true;
	while (expanded && passes < 5) {
		expanded = false;
		str = str.replace(/@(\w+)/g, (_reference: string, attr: string) => {
			expanded = true;
			const value = lexer[attr];
			let sub: string;
			if (typeof value === 'string') {
				sub = value;
			} else if (value instanceof RegExp) {
				sub = value.source;
			} else if (value === undefined) {
				throw monarchCommon.createError(lexer, 'language definition does not contain attribute \'' + attr + '\', used at: ' + str);
			} else {
				throw monarchCommon.createError(lexer, 'attribute reference \'' + attr + '\' must be a string, used at: ' + str);
			}
			return monarchCommon.empty(sub) ? '' : '(?:' + sub + ')';
		});
		passes++;
	}

	// turn the placeholder back into a literal @
	str = str.replace(/\x01/g, '@');

	const flags = (lexer.ignoreCase ? 'i' : '') + (lexer.unicode ? 'u' : '');

	// handle $Sn: the expression depends on the state, so compile lazily per state
	if (handleSn && str.match(/\$[sS](\d\d?)/g)) {
		let cachedState: string | null = null;
		let cachedRegEx: RegExp | null = null;
		return (state: string) => {
			if (!cachedRegEx || cachedState !== state) {
				cachedState = state;
				cachedRegEx = new RegExp(monarchCommon.substituteMatchesRe(lexer, str, state), flags);
			}
			return cachedRegEx;
		};
	}

	return new RegExp(str, flags);
}
141
142
/*
 * Compiles guard functions for case matches.
 * This compiles 'cases' attributes into efficient match functions.
 */

/**
 * Picks the string a guard should be tested against.
 *
 * @param id Returned when `num` is negative.
 * @param matches Returned element-wise when `num` is a valid index into it.
 * @param state Dotted state name; for `num >= 100` the entry `num - 100` of
 *              `[state, ...state.split('.')]` is returned.
 * @param num Selector: negative, a match index, or 100 plus a state-part index.
 * @returns The selected string, or `null` when `num` selects nothing.
 */
function selectScrutinee(id: string, matches: string[], state: string, num: number): string | null {
	if (num < 0) {
		return id;
	}
	if (num < matches.length) {
		return matches[num];
	}
	if (num >= 100) {
		// index 0 is the full state name, followed by its dot-separated parts
		const stateParts = [state, ...state.split('.')];
		const partIndex = num - 100;
		return partIndex < stateParts.length ? stateParts[partIndex] : null;
	}
	return null;
}
164
165
/**
 * Compiles one key of a 'cases' object into a branch with a test function.
 *
 * The key has the shape `[$n | $Sn | $#][operator]pattern`, where the operator is one of
 * `@`, `!@`, `~`, `!~`, `==`, `!=`. Without an explicit operator a plain word is compared
 * with `==`, an empty pattern means "is not empty" (`!=` against ''), and anything else is
 * treated as a regular expression (`~`).
 *
 * @param lexer The lexer definition; used for attribute lookups, case handling and errors.
 * @param ruleName Name of the enclosing rule, used in error messages.
 * @param tkey The case key to compile.
 * @param val The already compiled action to return when the guard matches.
 * @returns A branch whose `test` evaluates the guard.
 * @throws If an `@`/`!@` guard references an attribute that is missing or not an array of strings.
 */
function createGuard(lexer: monarchCommon.ILexerMin, ruleName: string, tkey: string, val: monarchCommon.FuzzyAction): monarchCommon.IBranch {
	// get the scrutinee and pattern
	let scrut = -1; // -1: $# (test against the id), 0-99: $n, 100+n: $Sn
	let oppat = tkey;
	let matches = tkey.match(/^\$(([sS]?)(\d\d?)|#)(.*)$/);
	if (matches) {
		if (matches[3]) { // if digits
			scrut = parseInt(matches[3], 10);
			if (matches[2]) {
				scrut = scrut + 100; // if [sS] present
			}
		}
		oppat = matches[4];
	}
	// get operator
	let op = '~';
	let pat = oppat;
	if (!oppat || oppat.length === 0) {
		op = '!=';
		pat = '';
	}
	else if (/^\w*$/.test(pat)) { // just a word
		op = '==';
	}
	else {
		matches = oppat.match(/^(@|!@|~|!~|==|!=)(.*)$/);
		if (matches) {
			op = matches[1];
			pat = matches[2];
		}
	}

	// set the tester function
	let tester: (s: string, id: string, matches: string[], state: string, eos: boolean) => boolean;

	// special case a regexp that matches just words
	if ((op === '~' || op === '!~') && /^(\w|\|)*$/.test(pat)) {
		const inWords = createKeywordMatcher(pat.split('|'), lexer.ignoreCase);
		tester = function (s) { return (op === '~' ? inWords(s) : !inWords(s)); };
	}
	else if (op === '@' || op === '!@') {
		const words = lexer[pat];
		if (!words) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' is not defined, in rule: ' + ruleName);
		}
		if (!(isArrayOf(function (elem) { return (typeof (elem) === 'string'); }, words))) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' must be an array of strings, in rule: ' + ruleName);
		}
		const inWords = createKeywordMatcher(words, lexer.ignoreCase);
		tester = function (s) { return (op === '@' ? inWords(s) : !inWords(s)); };
	}
	else if (op === '~' || op === '!~') {
		if (pat.indexOf('$') < 0) {
			// precompile regular expression
			const re = compileRegExp(lexer, '^' + pat + '$', false);
			tester = function (s) { return (op === '~' ? re.test(s) : !re.test(s)); };
		}
		else {
			// the pattern refers to matches or state parts, so it can only be compiled at test time
			tester = function (s, id, matches, state) {
				const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$', false);
				// honour the negated operator here too, exactly like the precompiled variant above
				return (op === '~' ? re.test(s) : !re.test(s));
			};
		}
	}
	else { // if (op==='==' || op==='!=') {
		if (pat.indexOf('$') < 0) {
			const patx = monarchCommon.fixCase(lexer, pat);
			tester = function (s) { return (op === '==' ? s === patx : s !== patx); };
		}
		else {
			const patx = monarchCommon.fixCase(lexer, pat);
			tester = function (s, id, matches, state) {
				const patexp = monarchCommon.substituteMatches(lexer, patx, id, matches, state);
				return (op === '==' ? s === patexp : s !== patexp);
			};
		}
	}

	// return the branch object
	if (scrut === -1) {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				return tester(id, id, matches, state, eos);
			}
		};
	}
	else {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				const scrutinee = selectScrutinee(id, matches, state, scrut);
				// a missing scrutinee is tested as the empty string
				return tester(!scrutinee ? '' : scrutinee, id, matches, state, eos);
			}
		};
	}
}
260
261
/**
 * Compiles an action: i.e. optimizes regular expressions and case matches
 * and does many sanity checks.
 *
 * This is called only during compilation, but if the lexer definition
 * contains user functions as actions (which is usually not allowed), then this
 * may be called during lexing. It is therefore important to compile common cases efficiently.
 *
 * @param lexer The lexer definition; `usesEmbedded` is set on it when a `nextEmbedded` action is found.
 * @param ruleName Name of the rule being compiled, used in error messages.
 * @param action The raw action: a string, an object with 'token' or 'cases', or an array of actions.
 * @returns The compiled action.
 * @throws If the action is malformed or refers to an undefined next state.
 */
function compileAction(lexer: monarchCommon.ILexerMin, ruleName: string, action: any): monarchCommon.FuzzyAction {
	// no action at all: produce an empty token
	if (!action) {
		return { token: '' };
	}

	// a bare string is already a valid action
	if (typeof action === 'string') {
		return action; // { token: action };
	}

	// an object with a 'token' attribute
	if (action.token || action.token === '') {
		if (typeof action.token !== 'string') {
			throw monarchCommon.createError(lexer, 'a \'token\' attribute must be of type string, in rule: ' + ruleName);
		}

		// only copy specific typed fields (only happens once during compile Lexer)
		const compiled: monarchCommon.IAction = { token: action.token };
		if (action.token.indexOf('$') >= 0) {
			compiled.tokenSubst = true;
		}

		if (typeof action.bracket === 'string') {
			switch (action.bracket) {
				case '@open':
					compiled.bracket = monarchCommon.MonarchBracket.Open;
					break;
				case '@close':
					compiled.bracket = monarchCommon.MonarchBracket.Close;
					break;
				default:
					throw monarchCommon.createError(lexer, 'a \'bracket\' attribute must be either \'@open\' or \'@close\', in rule: ' + ruleName);
			}
		}

		if (action.next) {
			if (typeof action.next !== 'string') {
				throw monarchCommon.createError(lexer, 'the next state must be a string value in rule: ' + ruleName);
			}
			let target: string = action.next;
			const isStackCommand = /^(@pop|@push|@popall)$/.test(target);
			if (!isStackCommand) {
				if (target[0] === '@') {
					target = target.substring(1); // peel off starting @ sign
				}
				// without dollar substitution the target is static, so its existence can be checked now
				if (target.indexOf('$') < 0 && !monarchCommon.stateExists(lexer, monarchCommon.substituteMatches(lexer, target, '', [], ''))) {
					throw monarchCommon.createError(lexer, 'the next state \'' + action.next + '\' is not defined in rule: ' + ruleName);
				}
			}
			compiled.next = target;
		}

		if (typeof action.goBack === 'number') {
			compiled.goBack = action.goBack;
		}
		if (typeof action.switchTo === 'string') {
			compiled.switchTo = action.switchTo;
		}
		if (typeof action.log === 'string') {
			compiled.log = action.log;
		}
		if (typeof action.nextEmbedded === 'string') {
			compiled.nextEmbedded = action.nextEmbedded;
			lexer.usesEmbedded = true;
		}
		return compiled;
	}

	// an array of actions: compile each member into a group
	if (Array.isArray(action)) {
		const group: monarchCommon.FuzzyAction[] = [];
		for (const member of action) {
			group.push(compileAction(lexer, ruleName, member));
		}
		return { group: group };
	}

	// an object with a 'cases' attribute
	if (action.cases) {
		// build an array of test cases
		const branches: monarchCommon.IBranch[] = [];
		let embeddedEndSeen = false;

		// for each case, push a test function and result value
		for (const key in action.cases) {
			if (!action.cases.hasOwnProperty(key)) {
				continue;
			}
			const value = compileAction(lexer, ruleName, action.cases[key]);

			// what kind of case
			if (key === '@default' || key === '@' || key === '') {
				branches.push({ test: undefined, value: value, name: key });
			} else if (key === '@eos') {
				branches.push({ test: (_id, _matches, _state, eos) => eos, value: value, name: key });
			} else {
				branches.push(createGuard(lexer, ruleName, key, value)); // call separate function to avoid local variable capture
			}

			if (!embeddedEndSeen) {
				embeddedEndSeen = !isString(value) && (value.hasEmbeddedEndInCases || ['@pop', '@popall'].includes(value.nextEmbedded || ''));
			}
		}

		// create a matching function: the first branch that matches wins
		const fallback = lexer.defaultToken;
		return {
			hasEmbeddedEndInCases: embeddedEndSeen,
			test: function (id, matches, state, eos) {
				for (const branch of branches) {
					if (!branch.test || branch.test(id, matches, state, eos)) {
						return branch.value;
					}
				}
				return fallback;
			}
		};
	}

	throw monarchCommon.createError(lexer, 'an action must be a string, an object with a \'token\' or \'cases\' attribute, or an array of actions; in rule: ' + ruleName);
}
383
384
/**
 * A regular expression that depends on the lexer state: called with a state
 * name, it returns the RegExp to use for that state.
 */
type DynamicRegExp = (state: string) => RegExp;
385
386
/**
 * Helper class for creating matching rules.
 *
 * A rule starts out with an empty regular expression and an empty-token action;
 * `setRegex` and `setAction` fill in the compiled versions.
 */
class Rule implements monarchCommon.IRule {
	private regex: RegExp | DynamicRegExp = new RegExp('');
	public action: monarchCommon.FuzzyAction = { token: '' };
	public matchOnlyAtLineStart: boolean = false;
	public name: string = '';

	constructor(name: string) {
		this.name = name;
	}

	/**
	 * Compiles the rule's matcher from a pattern string or a RegExp (only its source is used).
	 * A leading '^' is stripped from the pattern and recorded in `matchOnlyAtLineStart`;
	 * the pattern source is appended to the rule name.
	 *
	 * @throws If `re` is neither a string nor a RegExp.
	 */
	public setRegex(lexer: monarchCommon.ILexerMin, re: string | RegExp): void {
		let source: string;
		if (typeof re === 'string') {
			source = re;
		} else if (re instanceof RegExp) {
			source = re.source;
		} else {
			throw monarchCommon.createError(lexer, 'rules must start with a match string or regular expression: ' + this.name);
		}

		const startsWithCaret = source.length > 0 && source[0] === '^';
		const body = startsWithCaret ? source.substring(1) : source;

		this.matchOnlyAtLineStart = startsWithCaret;
		this.name = this.name + ': ' + source;
		this.regex = compileRegExp(lexer, '^(?:' + body + ')', true);
	}

	/**
	 * Compiles and stores the rule's action.
	 */
	public setAction(lexer: monarchCommon.ILexerMin, act: monarchCommon.IAction) {
		this.action = compileAction(lexer, this.name, act);
	}

	/**
	 * Returns the regular expression to use in the given state, building it
	 * from the state when the rule's expression is state dependent.
	 */
	public resolveRegex(state: string): RegExp {
		return this.regex instanceof RegExp ? this.regex : this.regex(state);
	}
}
428
429
/**
 * Compiles a JSON language description into a lexer where all regular expressions,
 * case matches etc. are compiled and all include rules are expanded.
 * Also compiles the bracket definitions, supplies defaults, and does many sanity checks.
 *
 * Note that `json` itself is modified: the resolved lexer options (languageId, includeLF,
 * ignoreCase, ...) are written onto it because it doubles as the minimal lexer passed to
 * the rule compiler, and default `brackets` are stored on it when none are given.
 *
 * @param languageId The language identifier; also used for the default token postfix.
 * @param json The language definition.
 * @returns The compiled lexer, with `noThrow` switched on.
 * @throws If the definition is not an object, has no 'tokenizer' object, or contains
 *         malformed rules, actions or brackets.
 */
export function compile(languageId: string, json: IMonarchLanguage): monarchCommon.ILexer {
	if (!json || typeof (json) !== 'object') {
		throw new Error('Monarch: expecting a language definition object');
	}

	// Create our lexer
	const lexer: monarchCommon.ILexer = {
		languageId: languageId,
		includeLF: bool(json.includeLF, false),
		noThrow: false, // raise exceptions during compilation
		maxStack: 100,
		start: (typeof json.start === 'string' ? json.start : null),
		ignoreCase: bool(json.ignoreCase, false),
		unicode: bool(json.unicode, false),
		tokenPostfix: string(json.tokenPostfix, '.' + languageId),
		defaultToken: string(json.defaultToken, 'source'),
		usesEmbedded: false, // becomes true if we find a nextEmbedded action
		stateNames: {},
		tokenizer: {},
		brackets: []
	};

	// For calling compileAction later on
	const lexerMin: monarchCommon.ILexerMin = <any>json;
	lexerMin.languageId = languageId;
	lexerMin.includeLF = lexer.includeLF;
	lexerMin.ignoreCase = lexer.ignoreCase;
	lexerMin.unicode = lexer.unicode;
	lexerMin.noThrow = lexer.noThrow;
	lexerMin.usesEmbedded = lexer.usesEmbedded;
	lexerMin.stateNames = json.tokenizer;
	lexerMin.defaultToken = lexer.defaultToken;


	// Compile an array of rules into newrules where RegExp objects are created.
	function addRules(state: string, newrules: monarchCommon.IRule[], rules: any[]) {
		for (const rule of rules) {
			// a missing rule (null/undefined) gets the regular diagnostic instead of a raw TypeError
			if (!rule) {
				throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
			}

			let include = rule.include;
			if (include) {
				if (typeof (include) !== 'string') {
					throw monarchCommon.createError(lexer, 'an \'include\' attribute must be a string at: ' + state);
				}
				if (include[0] === '@') {
					include = include.substr(1); // peel off starting @
				}
				if (!json.tokenizer[include]) {
					throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is not defined at: ' + state);
				}
				// included rules are expanded in place
				addRules(state + '.' + include, newrules, json.tokenizer[include]);
			}
			else {
				const newrule = new Rule(state);

				// Set up new rule attributes
				if (Array.isArray(rule) && rule.length >= 1 && rule.length <= 3) {
					newrule.setRegex(lexerMin, rule[0]);
					if (rule.length >= 3) {
						if (typeof (rule[1]) === 'string') {
							newrule.setAction(lexerMin, { token: rule[1], next: rule[2] });
						}
						else if (typeof (rule[1]) === 'object') {
							const rule1 = rule[1];
							rule1.next = rule[2];
							newrule.setAction(lexerMin, rule1);
						}
						else {
							throw monarchCommon.createError(lexer, 'a next state as the last element of a rule can only be given if the action is either an object or a string, at: ' + state);
						}
					}
					else {
						newrule.setAction(lexerMin, rule[1]);
					}
				}
				else {
					if (!rule.regex) {
						throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
					}
					if (rule.name) {
						if (typeof rule.name === 'string') {
							newrule.name = rule.name;
						}
					}
					// NOTE(review): the guard reads 'matchOnlyAtStart' while the value is taken from
					// 'matchOnlyAtLineStart', and setRegex below overwrites the field in any case, so
					// this currently has no effect. Left as is; confirm the intended behaviour.
					if (rule.matchOnlyAtStart) {
						newrule.matchOnlyAtLineStart = bool(rule.matchOnlyAtLineStart, false);
					}
					newrule.setRegex(lexerMin, rule.regex);
					newrule.setAction(lexerMin, rule.action);
				}

				newrules.push(newrule);
			}
		}
	}

	// compile the tokenizer rules
	if (!json.tokenizer || typeof (json.tokenizer) !== 'object') {
		throw monarchCommon.createError(lexer, 'a language definition must define the \'tokenizer\' attribute as an object');
	}

	lexer.tokenizer = <any>[];
	for (const key in json.tokenizer) {
		if (json.tokenizer.hasOwnProperty(key)) {
			// without an explicit start state, the first state of the tokenizer is used
			if (!lexer.start) {
				lexer.start = key;
			}

			const rules = json.tokenizer[key];
			lexer.tokenizer[key] = new Array();
			addRules('tokenizer.' + key, lexer.tokenizer[key], rules);
		}
	}
	lexer.usesEmbedded = lexerMin.usesEmbedded; // can be set during compileAction

	// Set simple brackets
	if (json.brackets) {
		if (!(Array.isArray(<any>json.brackets))) {
			throw monarchCommon.createError(lexer, 'the \'brackets\' attribute must be defined as an array');
		}
	}
	else {
		json.brackets = [
			{ open: '{', close: '}', token: 'delimiter.curly' },
			{ open: '[', close: ']', token: 'delimiter.square' },
			{ open: '(', close: ')', token: 'delimiter.parenthesis' },
			{ open: '<', close: '>', token: 'delimiter.angle' }];
	}
	const brackets: IMonarchLanguageBracket[] = [];
	for (const el of json.brackets) {
		let desc: any = el;
		// the [open, close, token] array form is normalized to the object form
		if (desc && Array.isArray(desc) && desc.length === 3) {
			desc = { token: desc[2], open: desc[0], close: desc[1] };
		}
		// validate the shape first, so that malformed entries (including null) are reported
		// as such rather than crashing or being reported as "equal" brackets
		if (!desc || typeof desc.open !== 'string' || typeof desc.token !== 'string' || typeof desc.close !== 'string') {
			throw monarchCommon.createError(lexer, 'every element in the \'brackets\' array must be a \'{open,close,token}\' object or array');
		}
		if (desc.open === desc.close) {
			throw monarchCommon.createError(lexer, 'open and close brackets in a \'brackets\' attribute must be different: ' + desc.open +
				'\n hint: use the \'bracket\' attribute if matching on equal brackets is required.');
		}
		brackets.push({
			token: desc.token + lexer.tokenPostfix,
			open: monarchCommon.fixCase(lexer, desc.open),
			close: monarchCommon.fixCase(lexer, desc.close)
		});
	}
	lexer.brackets = brackets;

	// Disable throw so the syntax highlighter goes, no matter what
	lexer.noThrow = true;
	return lexer;
}
592
593