// Path: blob/main/src/vs/editor/standalone/common/monarch/monarchCompile.ts
// 3296 views
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

/*
 * This module only exports 'compile' which compiles a JSON language definition
 * into a typed and checked ILexer definition.
 */

import { isString } from '../../../../base/common/types.js';
import * as monarchCommon from './monarchCommon.js';
import { IMonarchLanguage, IMonarchLanguageBracket } from './monarchTypes.js';

/*
 * Type helpers
 *
 * Note: this is just for sanity checks on the JSON description which is
 * helpful for the programmer. No checks are done anymore once the lexer is
 * already 'compiled and checked'.
 *
 */

/**
 * Returns true when `obj` is an array whose every element satisfies `elemType`.
 * Any falsy value and any non-array value yields false.
 */
function isArrayOf(elemType: (x: any) => boolean, obj: any): boolean {
	if (!obj) {
		return false;
	}
	if (!(Array.isArray(obj))) {
		return false;
	}
	for (const el of obj) {
		if (!(elemType(el))) {
			return false;
		}
	}
	return true;
}

/**
 * Returns `prop` when it is a boolean, otherwise `defValue`.
 */
function bool(prop: any, defValue: boolean): boolean {
	if (typeof prop === 'boolean') {
		return prop;
	}
	return defValue;
}

/**
 * Returns `prop` when it is a string, otherwise `defValue`.
 */
function string(prop: any, defValue: string): string {
	if (typeof (prop) === 'string') {
		return prop;
	}
	return defValue;
}

/**
 * Builds a lookup table that maps every entry of `array` to `true`.
 *
 * The table has no prototype, so entries such as 'hasOwnProperty', 'constructor'
 * or '__proto__' are stored as ordinary own keys and never collide with
 * inherited members.
 */
function arrayToHash(array: string[]): { [name: string]: true } {
	const result: { [name: string]: true } = Object.create(null);
	for (const e of array) {
		result[e] = true;
	}
	return result;
}

/**
 * Creates a predicate that tells whether a word is one of `arr`.
 *
 * @param arr The words to recognize.
 * @param caseInsensitive When true, both the word list and the tested word are lower-cased before comparing.
 * @returns A function returning true exactly for the words in `arr`.
 */
function createKeywordMatcher(arr: string[], caseInsensitive: boolean = false): (str: string) => boolean {
	if (caseInsensitive) {
		arr = arr.map(function (x) { return x.toLowerCase(); });
	}
	const hash = arrayToHash(arr);
	// The table is prototype-less, so a plain lookup is sufficient: only own keys exist.
	if (caseInsensitive) {
		return function (word) {
			return hash[word.toLowerCase()] === true;
		};
	} else {
		return function (word) {
			return hash[word] === true;
		};
	}
}


// Lexer helpers

/**
 * Compiles a regular expression string, adding the 'i' flag if 'ignoreCase' is set, and the 'u' flag if 'unicode' is set.
 * Also replaces @\w+ sequences with the content of the specified attribute.
 * @\w+ replacement can be avoided by escaping `@` signs with another `@` sign.
 * @example /@attr/ will be replaced with the value of lexer[attr]
 * @example /@@text/ will not be replaced and will become /@text/.
 *
 * When `handleSn` is true and the expanded source contains a `$S<n>` / `$s<n>` reference, a function is
 * returned instead of a RegExp: it builds the expression for a given state (via
 * `monarchCommon.substituteMatchesRe`) and remembers the most recently built one.
 *
 * Throws (via `monarchCommon.createError`) when a referenced attribute is missing or is neither a
 * string nor a RegExp.
 */
function compileRegExp<S extends true | false>(lexer: monarchCommon.ILexerMin, str: string, handleSn: S): S extends true ? RegExp | DynamicRegExp : RegExp;
function compileRegExp(lexer: monarchCommon.ILexerMin, str: string, handleSn: true | false): RegExp | DynamicRegExp {
	// @@ must be interpreted as a literal @, so we replace all occurrences of @@ with a placeholder character
	str = str.replace(/@@/g, `\x01`);

	// Attribute values may themselves contain @references, so expand repeatedly;
	// the pass count is capped at 5 so self-referencing attributes cannot loop forever.
	let n = 0;
	let hadExpansion: boolean;
	do {
		hadExpansion = false;
		str = str.replace(/@(\w+)/g, function (s, attr?) {
			hadExpansion = true;
			let sub = '';
			if (typeof (lexer[attr]) === 'string') {
				sub = lexer[attr];
			} else if (lexer[attr] && lexer[attr] instanceof RegExp) {
				sub = lexer[attr].source;
			} else {
				if (lexer[attr] === undefined) {
					throw monarchCommon.createError(lexer, 'language definition does not contain attribute \'' + attr + '\', used at: ' + str);
				} else {
					throw monarchCommon.createError(lexer, 'attribute reference \'' + attr + '\' must be a string, used at: ' + str);
				}
			}
			// wrap in a non-capturing group so the substituted text stays one unit
			return (monarchCommon.empty(sub) ? '' : '(?:' + sub + ')');
		});
		n++;
	} while (hadExpansion && n < 5);

	// handle escaped @@
	str = str.replace(/\x01/g, '@');

	const flags = (lexer.ignoreCase ? 'i' : '') + (lexer.unicode ? 'u' : '');

	// handle $Sn
	if (handleSn) {
		const match = str.match(/\$[sS](\d\d?)/g);
		if (match) {
			// single-entry cache: consecutive calls with the same state reuse the RegExp
			let lastState: string | null = null;
			let lastRegEx: RegExp | null = null;
			return (state: string) => {
				if (lastRegEx && lastState === state) {
					return lastRegEx;
				}
				lastState = state;
				lastRegEx = new RegExp(monarchCommon.substituteMatchesRe(lexer, str, state), flags);
				return lastRegEx;
			};
		}
	}

	return new RegExp(str, flags);
}

/**
 * Compiles guard functions for case matches.
 * This compiles 'cases' attributes into efficient match functions.
 *
 */

/**
 * Picks the string a guard is tested against.
 *
 * @param id The complete matched text.
 * @param matches The match groups.
 * @param state The current state name.
 * @param num Selector: negative selects `id`; an index below `matches.length` selects that
 *            group; 100 + n selects the n-th entry of `[state, ...state.split('.')]`.
 * @returns The selected string, or null when the selector is out of range.
 */
function selectScrutinee(id: string, matches: string[], state: string, num: number): string | null {
	if (num < 0) {
		return id;
	}
	if (num < matches.length) {
		return matches[num];
	}
	if (num >= 100) {
		num = num - 100;
		const parts = state.split('.');
		parts.unshift(state); // index 0 is the full state name
		if (num < parts.length) {
			return parts[num];
		}
	}
	return null;
}

/**
 * Compiles one key of a 'cases' object into a branch with a test function.
 *
 * The key has the shape `[$n | $Sn | $#]<op><pattern>` where `<op>` is one of
 * `@`, `!@`, `~`, `!~`, `==`, `!=`. A key that is just a word compares with `==`, an empty
 * pattern compares with `!= ''`, and anything else without an operator is treated as a
 * regular expression (`~`).
 *
 * @param lexer The lexer definition; used for attribute lookups and case handling.
 * @param ruleName Name of the enclosing rule, used in error messages.
 * @param tkey The case key to compile.
 * @param val The already compiled action for this case.
 * @returns A branch whose `test` decides whether the case applies.
 */
function createGuard(lexer: monarchCommon.ILexerMin, ruleName: string, tkey: string, val: monarchCommon.FuzzyAction): monarchCommon.IBranch {
	// get the scrutinee and pattern
	let scrut = -1; // -1: $!, 0-99: $n, 100+n: $Sn
	let oppat = tkey;
	let matches = tkey.match(/^\$(([sS]?)(\d\d?)|#)(.*)$/);
	if (matches) {
		if (matches[3]) { // if digits
			scrut = parseInt(matches[3], 10);
			if (matches[2]) {
				scrut = scrut + 100; // if [sS] present
			}
		}
		oppat = matches[4];
	}
	// get operator
	let op = '~';
	let pat = oppat;
	if (!oppat || oppat.length === 0) {
		op = '!=';
		pat = '';
	}
	else if (/^\w*$/.test(pat)) { // just a word
		op = '==';
	}
	else {
		matches = oppat.match(/^(@|!@|~|!~|==|!=)(.*)$/);
		if (matches) {
			op = matches[1];
			pat = matches[2];
		}
	}

	// set the tester function
	let tester: (s: string, id: string, matches: string[], state: string, eos: boolean) => boolean;

	// special case a regexp that matches just words
	if ((op === '~' || op === '!~') && /^(\w|\|)*$/.test(pat)) {
		const inWords = createKeywordMatcher(pat.split('|'), lexer.ignoreCase);
		tester = function (s) { return (op === '~' ? inWords(s) : !inWords(s)); };
	}
	else if (op === '@' || op === '!@') {
		const words = lexer[pat];
		if (!words) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' is not defined, in rule: ' + ruleName);
		}
		if (!(isArrayOf(function (elem) { return (typeof (elem) === 'string'); }, words))) {
			throw monarchCommon.createError(lexer, 'the @ match target \'' + pat + '\' must be an array of strings, in rule: ' + ruleName);
		}
		const inWords = createKeywordMatcher(words, lexer.ignoreCase);
		tester = function (s) { return (op === '@' ? inWords(s) : !inWords(s)); };
	}
	else if (op === '~' || op === '!~') {
		if (pat.indexOf('$') < 0) {
			// precompile regular expression
			const re = compileRegExp(lexer, '^' + pat + '$', false);
			tester = function (s) { return (op === '~' ? re.test(s) : !re.test(s)); };
		}
		else {
			// the pattern depends on the current match, so it has to be compiled per test
			tester = function (s, id, matches, state) {
				const re = compileRegExp(lexer, '^' + monarchCommon.substituteMatches(lexer, pat, id, matches, state) + '$', false);
				// honor the negated operator here as well, like the precompiled branch does
				return (op === '~' ? re.test(s) : !re.test(s));
			};
		}
	}
	else { // if (op==='==' || op==='!=') {
		if (pat.indexOf('$') < 0) {
			const patx = monarchCommon.fixCase(lexer, pat);
			tester = function (s) { return (op === '==' ? s === patx : s !== patx); };
		}
		else {
			const patx = monarchCommon.fixCase(lexer, pat);
			tester = function (s, id, matches, state, eos) {
				const patexp = monarchCommon.substituteMatches(lexer, patx, id, matches, state);
				return (op === '==' ? s === patexp : s !== patexp);
			};
		}
	}

	// return the branch object
	if (scrut === -1) {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				return tester(id, id, matches, state, eos);
			}
		};
	}
	else {
		return {
			name: tkey, value: val, test: function (id, matches, state, eos) {
				const scrutinee = selectScrutinee(id, matches, state, scrut);
				return tester(!scrutinee ? '' : scrutinee, id, matches, state, eos);
			}
		};
	}
}

/**
 * Compiles an action: i.e. optimize regular expressions and case matches
 * and do many sanity checks.
 *
 * This is called only during compilation but if the lexer definition
 * contains user functions as actions (which is usually not allowed), then this
 * may be called during lexing. It is important therefore to compile common cases efficiently
 *
 * Accepted shapes of `action`:
 *  - falsy: compiled to `{ token: '' }`
 *  - string: returned unchanged
 *  - object with a `token` attribute: selected typed fields are copied into a new action
 *  - array: every element is compiled, the result is `{ group: [...] }`
 *  - object with a `cases` attribute: compiled into a `test` function that returns the value of
 *    the first matching case, or `lexer.defaultToken` when none matches
 *
 * Throws (via `monarchCommon.createError`) on any other shape, on a non-string `token` or `next`,
 * on an unknown `bracket` value, and on a `next` state that is not defined.
 * Sets `lexer.usesEmbedded` when an action has a string `nextEmbedded`.
 */
function compileAction(lexer: monarchCommon.ILexerMin, ruleName: string, action: any): monarchCommon.FuzzyAction {
	if (!action) {
		return { token: '' };
	}
	else if (typeof (action) === 'string') {
		return action; // { token: action };
	}
	else if (action.token || action.token === '') {
		if (typeof (action.token) !== 'string') {
			throw monarchCommon.createError(lexer, 'a \'token\' attribute must be of type string, in rule: ' + ruleName);
		}
		else {
			// only copy specific typed fields (only happens once during compile Lexer)
			const newAction: monarchCommon.IAction = { token: action.token };
			if (action.token.indexOf('$') >= 0) {
				newAction.tokenSubst = true;
			}
			if (typeof (action.bracket) === 'string') {
				if (action.bracket === '@open') {
					newAction.bracket = monarchCommon.MonarchBracket.Open;
				} else if (action.bracket === '@close') {
					newAction.bracket = monarchCommon.MonarchBracket.Close;
				} else {
					throw monarchCommon.createError(lexer, 'a \'bracket\' attribute must be either \'@open\' or \'@close\', in rule: ' + ruleName);
				}
			}
			if (action.next) {
				if (typeof (action.next) !== 'string') {
					throw monarchCommon.createError(lexer, 'the next state must be a string value in rule: ' + ruleName);
				}
				else {
					let next: string = action.next;
					if (!/^(@pop|@push|@popall)$/.test(next)) {
						if (next[0] === '@') {
							next = next.substring(1); // peel off starting @ sign
						}
						if (next.indexOf('$') < 0) { // no dollar substitution, we can check if the state exists
							if (!monarchCommon.stateExists(lexer, monarchCommon.substituteMatches(lexer, next, '', [], ''))) {
								throw monarchCommon.createError(lexer, 'the next state \'' + action.next + '\' is not defined in rule: ' + ruleName);
							}
						}
					}
					newAction.next = next;
				}
			}
			if (typeof (action.goBack) === 'number') {
				newAction.goBack = action.goBack;
			}
			if (typeof (action.switchTo) === 'string') {
				newAction.switchTo = action.switchTo;
			}
			if (typeof (action.log) === 'string') {
				newAction.log = action.log;
			}
			if (typeof (action.nextEmbedded) === 'string') {
				newAction.nextEmbedded = action.nextEmbedded;
				lexer.usesEmbedded = true;
			}
			return newAction;
		}
	}
	else if (Array.isArray(action)) {
		const results: monarchCommon.FuzzyAction[] = [];
		for (let i = 0, len = action.length; i < len; i++) {
			results[i] = compileAction(lexer, ruleName, action[i]);
		}
		return { group: results };
	}
	else if (action.cases) {
		// build an array of test cases
		const cases: monarchCommon.IBranch[] = [];

		let hasEmbeddedEndInCases = false;
		// for each case, push a test function and result value
		for (const tkey in action.cases) {
			// go through Object.prototype so a case key named 'hasOwnProperty' cannot shadow the check
			if (Object.prototype.hasOwnProperty.call(action.cases, tkey)) {
				const val = compileAction(lexer, ruleName, action.cases[tkey]);

				// what kind of case
				if (tkey === '@default' || tkey === '@' || tkey === '') {
					cases.push({ test: undefined, value: val, name: tkey });
				}
				else if (tkey === '@eos') {
					cases.push({ test: function (id, matches, state, eos) { return eos; }, value: val, name: tkey });
				}
				else {
					cases.push(createGuard(lexer, ruleName, tkey, val)); // call separate function to avoid local variable capture
				}

				// remember whether any (nested) case pops out of an embedded language
				if (!hasEmbeddedEndInCases) {
					hasEmbeddedEndInCases = !isString(val) && (val.hasEmbeddedEndInCases || ['@pop', '@popall'].includes(val.nextEmbedded || ''));
				}
			}
		}

		// create a matching function
		const def = lexer.defaultToken;
		return {
			hasEmbeddedEndInCases,
			test: function (id, matches, state, eos) {
				for (const _case of cases) {
					// a case without a test (the default case) always matches
					const didmatch = (!_case.test || _case.test(id, matches, state, eos));
					if (didmatch) {
						return _case.value;
					}
				}
				return def;
			}
		};
	}
	else {
		throw monarchCommon.createError(lexer, 'an action must be a string, an object with a \'token\' or \'cases\' attribute, or an array of actions; in rule: ' + ruleName);
	}
}

/** A regular expression that has to be built for the state it is used in. */
type DynamicRegExp = (state: string) => RegExp;

/**
 * Helper class for creating matching rules
 */
class Rule implements monarchCommon.IRule {
	private regex: RegExp | DynamicRegExp = new RegExp('');
	public action: monarchCommon.FuzzyAction = { token: '' };
	public matchOnlyAtLineStart: boolean = false;
	public name: string = '';

	constructor(name: string) {
		this.name = name;
	}

	/**
	 * Compiles `re` (a source string or a RegExp, of which only the source is used) into the
	 * rule's expression, anchored at the current position.
	 *
	 * A leading '^' is stripped and recorded in `matchOnlyAtLineStart` instead; the regex source
	 * is appended to the rule name. Throws when `re` is neither a string nor a RegExp.
	 */
	public setRegex(lexer: monarchCommon.ILexerMin, re: string | RegExp): void {
		let sregex: string;
		if (typeof (re) === 'string') {
			sregex = re;
		}
		else if (re instanceof RegExp) {
			sregex = re.source;
		}
		else {
			throw monarchCommon.createError(lexer, 'rules must start with a match string or regular expression: ' + this.name);
		}

		this.matchOnlyAtLineStart = (sregex.length > 0 && sregex[0] === '^');
		this.name = this.name + ': ' + sregex;
		this.regex = compileRegExp(lexer, '^(?:' + (this.matchOnlyAtLineStart ? sregex.substring(1) : sregex) + ')', true);
	}

	/** Compiles `act` with `compileAction` and stores the result as this rule's action. */
	public setAction(lexer: monarchCommon.ILexerMin, act: monarchCommon.IAction) {
		this.action = compileAction(lexer, this.name, act);
	}

	/** Returns the rule's regular expression, building it for `state` when it is state dependent. */
	public resolveRegex(state: string): RegExp {
		if (this.regex instanceof RegExp) {
			return this.regex;
		} else {
			return this.regex(state);
		}
	}
}

/**
 * Compiles a json description function into json where all regular expressions,
 * case matches etc, are compiled and all include rules are expanded.
 * We also compile the bracket definitions, supply defaults, and do many sanity checks.
 * If the 'jsonStrict' parameter is 'false', we allow at certain locations
 * regular expression objects and functions that get called during lexing.
 * (Currently we have no samples that need this so perhaps we should always have
 * jsonStrict to true).
 *
 * Note that `json` itself is modified: lexer settings are written onto it (it doubles as the
 * minimal lexer handed to the rule compiler) and a missing 'brackets' attribute is filled in
 * with the default bracket set.
 *
 * @param languageId The language identifier; also the basis of the default token postfix.
 * @param json The language definition.
 * @returns The compiled lexer, with `noThrow` switched on.
 * @throws When `json` is not an object, or (via `monarchCommon.createError`) when the definition is invalid.
 */
export function compile(languageId: string, json: IMonarchLanguage): monarchCommon.ILexer {
	if (!json || typeof (json) !== 'object') {
		throw new Error('Monarch: expecting a language definition object');
	}

	// Create our lexer
	const lexer: monarchCommon.ILexer = {
		languageId: languageId,
		includeLF: bool(json.includeLF, false),
		noThrow: false, // raise exceptions during compilation
		maxStack: 100,
		start: (typeof json.start === 'string' ? json.start : null),
		ignoreCase: bool(json.ignoreCase, false),
		unicode: bool(json.unicode, false),
		tokenPostfix: string(json.tokenPostfix, '.' + languageId),
		defaultToken: string(json.defaultToken, 'source'),
		usesEmbedded: false, // becomes true if we find a nextEmbedded action
		stateNames: {},
		tokenizer: {},
		brackets: []
	};

	// For calling compileAction later on
	const lexerMin: monarchCommon.ILexerMin = <any>json;
	lexerMin.languageId = languageId;
	lexerMin.includeLF = lexer.includeLF;
	lexerMin.ignoreCase = lexer.ignoreCase;
	lexerMin.unicode = lexer.unicode;
	lexerMin.noThrow = lexer.noThrow;
	lexerMin.usesEmbedded = lexer.usesEmbedded;
	lexerMin.stateNames = json.tokenizer;
	lexerMin.defaultToken = lexer.defaultToken;


	// Compile an array of rules into newrules where RegExp objects are created.
	function addRules(state: string, newrules: monarchCommon.IRule[], rules: any[]) {
		for (const rule of rules) {

			let include = rule.include;
			if (include) {
				if (typeof (include) !== 'string') {
					throw monarchCommon.createError(lexer, 'an \'include\' attribute must be a string at: ' + state);
				}
				if (include[0] === '@') {
					include = include.substring(1); // peel off starting @
				}
				if (!json.tokenizer[include]) {
					throw monarchCommon.createError(lexer, 'include target \'' + include + '\' is not defined at: ' + state);
				}
				// expand the included state's rules in place
				addRules(state + '.' + include, newrules, json.tokenizer[include]);
			}
			else {
				const newrule = new Rule(state);

				// Set up new rule attributes
				if (Array.isArray(rule) && rule.length >= 1 && rule.length <= 3) {
					newrule.setRegex(lexerMin, rule[0]);
					if (rule.length >= 3) {
						// [regex, action, next]: fold the next state into the action
						if (typeof (rule[1]) === 'string') {
							newrule.setAction(lexerMin, { token: rule[1], next: rule[2] });
						}
						else if (typeof (rule[1]) === 'object') {
							const rule1 = rule[1];
							rule1.next = rule[2];
							newrule.setAction(lexerMin, rule1);
						}
						else {
							throw monarchCommon.createError(lexer, 'a next state as the last element of a rule can only be given if the action is either an object or a string, at: ' + state);
						}
					}
					else {
						newrule.setAction(lexerMin, rule[1]);
					}
				}
				else {
					if (!rule.regex) {
						throw monarchCommon.createError(lexer, 'a rule must either be an array, or an object with a \'regex\' or \'include\' field at: ' + state);
					}
					if (rule.name && typeof rule.name === 'string') {
						newrule.name = rule.name;
					}
					newrule.setRegex(lexerMin, rule.regex);
					// setRegex derives the flag from a leading '^'; an explicit request on the rule
					// is applied afterwards so that it is not overwritten.
					if (bool(rule.matchOnlyAtLineStart, false)) {
						newrule.matchOnlyAtLineStart = true;
					}
					newrule.setAction(lexerMin, rule.action);
				}

				newrules.push(newrule);
			}
		}
	}

	// compile the tokenizer rules
	if (!json.tokenizer || typeof (json.tokenizer) !== 'object') {
		throw monarchCommon.createError(lexer, 'a language definition must define the \'tokenizer\' attribute as an object');
	}

	// lexer.tokenizer stays the plain object created above: it is a table keyed by state name,
	// and an array would reject state names such as 'length'.
	for (const key in json.tokenizer) {
		// go through Object.prototype so a state named 'hasOwnProperty' cannot shadow the check
		if (Object.prototype.hasOwnProperty.call(json.tokenizer, key)) {
			// without an explicit 'start', the first state becomes the start state
			if (!lexer.start) {
				lexer.start = key;
			}

			const rules = json.tokenizer[key];
			lexer.tokenizer[key] = new Array();
			addRules('tokenizer.' + key, lexer.tokenizer[key], rules);
		}
	}
	lexer.usesEmbedded = lexerMin.usesEmbedded;  // can be set during compileAction

	// Set simple brackets
	if (json.brackets) {
		if (!(Array.isArray(<any>json.brackets))) {
			throw monarchCommon.createError(lexer, 'the \'brackets\' attribute must be defined as an array');
		}
	}
	else {
		json.brackets = [
			{ open: '{', close: '}', token: 'delimiter.curly' },
			{ open: '[', close: ']', token: 'delimiter.square' },
			{ open: '(', close: ')', token: 'delimiter.parenthesis' },
			{ open: '<', close: '>', token: 'delimiter.angle' }];
	}
	const brackets: IMonarchLanguageBracket[] = [];
	for (const el of json.brackets) {
		let desc: any = el;
		// the array form is [open, close, token]
		if (desc && Array.isArray(desc) && desc.length === 3) {
			desc = { token: desc[2], open: desc[0], close: desc[1] };
		}
		// validate the shape first, so malformed entries get the descriptive error
		if (!desc || typeof desc.open !== 'string' || typeof desc.token !== 'string' || typeof desc.close !== 'string') {
			throw monarchCommon.createError(lexer, 'every element in the \'brackets\' array must be a \'{open,close,token}\' object or array');
		}
		if (desc.open === desc.close) {
			throw monarchCommon.createError(lexer, 'open and close brackets in a \'brackets\' attribute must be different: ' + desc.open +
				'\n hint: use the \'bracket\' attribute if matching on equal brackets is required.');
		}
		brackets.push({
			token: desc.token + lexer.tokenPostfix,
			open: monarchCommon.fixCase(lexer, desc.open),
			close: monarchCommon.fixCase(lexer, desc.close)
		});
	}
	lexer.brackets = brackets;

	// Disable throw so the syntax highlighter goes, no matter what
	lexer.noThrow = true;
	return lexer;
}