# Path: blob/main/Tools/c-analyzer/c_parser/parser/_regexes.py
# (12 views)
# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing. However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
#
# All multi-line patterns below are written in "verbose" regex syntax
# (insignificant whitespace plus "#" comments), so they are meant to be
# compiled with re.VERBOSE.
#
# NOTE(review): only the line indentation inside the pattern strings was
# normalized; spacing within each line is kept as found.  The exact
# spelling of the "(?: # <NAME>" markers presumably has to match what
# utils.set_capture_groups() searches for -- confirm against that helper.

import textwrap


def _ind(text, level=1, edges='both'):
    """Return *text* re-indented for embedding inside a larger pattern.

    Every line of *text* is prefixed with the indent for *level*.  The
    *edges* argument selects which ends of the result are normalized:

    * 'pre' or 'both': leading whitespace is replaced with a newline
      followed by the indent, so the embedded text always starts on a
      fresh line (which also terminates any verbose-mode comment that
      precedes the substitution point).
    * 'post' or 'both': trailing whitespace is replaced with a newline
      followed by the indent for ``level - 1``.

    Any other value of *edges* leaves both ends as textwrap.indent()
    produced them.
    """
    indent = ' ' * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        text = text.rstrip() + '\n' + ' ' * (level - 1)
    return text


#######################################
# general

# A single hexadecimal digit, as used in "\xHH" character escapes.
HEX = r'(?: [0-9a-fA-F] )'

# A character literal or a string literal (with backslash escapes).
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')

_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# From here on _KEYWORD is the same pattern with all whitespace removed,
# so that it can be embedded in the single-line patterns below.
_KEYWORD = ''.join(_KEYWORD.split())

IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but also allows an optional "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned ) # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')

COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

_STORAGE = 'auto register static extern _Thread_local'.split()
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')

DECLARATOR = textwrap.dedent(rf'''
    # declarator (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?: # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
            )
            |
            (?:
                [(] \s*
                (?: # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?: # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')

VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?: # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')

INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'";]*?
                }}
                (?= \s* ; ) # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')


#######################################
# compound type declarations

STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?: # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?: # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?: # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?: # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
    )
    ''')

ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?: # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?: # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                , | }}
            )
        )
    )
    ''')


#######################################
# statements

SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] ) # Note this lookahead.
    )
    # end simple statement body
    ''')
# In the "variable assignment" branch the member-access operator is
# matched with "[.]": a bare "." would match any character at all.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?: # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?: # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?: # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?: # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?: # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] ) # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')


#######################################
# function bodies

LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )? # </INLINE_PRE>
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )? # </STORAGE>
            (?:
                \s*
                (?: # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?: # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?: # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )? # </BLOCK_LEADING>
            (?: # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?: # <BLOCK_CLOSE>
            }}
        )
    )
    ''')

LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?: # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?: # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?: # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?: # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?: # <BLOCK_OPEN>
                    {{
                )
                |
                (?: # <BLOCK_CLOSE>
                    }}
                )
                |
                (?: # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')


#######################################
# global declarations

GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?: # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?: # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?: # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?: # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?: # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?: # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?: # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?: # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?: # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?: # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?: # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?: # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?: # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')