Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
allendowney
GitHub Repository: allendowney/cpython
Path: blob/main/Tools/c-analyzer/c_parser/parser/_regexes.py
12 views
1
# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing. However, a number of them
# have capturing markers compatible with utils.set_capture_groups().
5
6
import textwrap
7
8
9
def _ind(text, level=1, edges='both'):
    """Re-indent *text* so it can be embedded in a larger pattern.

    Every non-blank line of *text* is prefixed with *level* indent units.

    *edges* selects which ends are normalized:

    * ``'pre'`` or ``'both'``: leading whitespace is stripped and the
      text is restarted on a fresh, indented line.
    * ``'post'`` or ``'both'``: trailing whitespace is stripped and a
      newline followed by ``level - 1`` indent units is appended.

    Any other value leaves both ends as ``textwrap.indent()`` produced
    them.  Returns the re-indented string.
    """
    # NOTE(review): the indent unit is a single space as it appears in
    # this copy of the file; confirm against the canonical source.
    indent = ' ' * level
    text = textwrap.indent(text, indent)
    if edges in ('pre', 'both'):
        text = '\n' + indent + text.lstrip()
    if edges in ('post', 'both'):
        text = text.rstrip() + '\n' + ' ' * (level - 1)
    return text
17
18
19
#######################################
20
# general
21
22
# A single hexadecimal digit.  (Only 0-9, a-f and A-F are hex digits;
# the class previously also accepted g-z / G-Z.)
HEX = r'(?: [0-9a-fA-F] )'

# A C character literal or string literal.  Written for re.VERBOSE:
# whitespace and "#" comments inside the pattern are not significant.
# NOTE(review): the "\0dd" and "\oNNN" alternatives are kept exactly as
# found; confirm they cover the escape forms actually intended.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
            )
        )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
        )
        # end string literal
    )
    ''')
52
53
# Alternation of the reserved words, bounded by word breaks.  This
# multi-line form is embedded in KEYWORD below and then collapsed.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
        )
        \b
    )
    ''')
# The annotated (multi-line) keyword pattern.
KEYWORD = rf'''
# keyword
{_KEYWORD}
# end keyword
'''
# Strip all whitespace so the keyword pattern can be embedded inline
# in the single-line patterns below.
_KEYWORD = ''.join(_KEYWORD.split())

# Any C identifier (keywords included).
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but allows an optional "-<digits>" suffix.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
110
111
112
#######################################
113
# types
114
115
# A builtin type name: "void", a bare "signed"/"unsigned", or an
# optional sign and optional long/short prefix followed by a base type.
# Written for re.VERBOSE.
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned ) # implies int
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
            )
        )
        \b
    )
    # end simple type
    ''')
134
135
# The keyword that introduces a compound type (struct, union or enum),
# bounded by word breaks.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
136
137
138
#######################################
139
# variable declarations
140
141
# The storage-class specifiers recognized in declarations.
_STORAGE = 'auto register static extern _Thread_local'.split()
# Any one storage-class specifier, bounded by word breaks.
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
# A type qualifier, bounded by word breaks.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A pointer "*" optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'
145
146
# A type specifier: a simple type, a typeof(...) expression, a
# reference to a compound type, or a reference to a typedef name.
# Written for re.VERBOSE.
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
        )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
        )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
    )
    # end type spec
    ''')
170
171
# A declarator: optional pointer qualifiers followed by a plain name,
# a parenthesized name, or a function pointer, each with optional
# array suffixes.  Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
DECLARATOR = textwrap.dedent(rf'''
    # declarator (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?: # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
            )
            |
            (?:
                [(] \s*
                (?: # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
            )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?: # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
            )
        )
    )
    # end declarator
    ''')
210
211
# A variable declaration up to (not including) any initializer:
# optional storage class, optional type qualifier, a type spec and a
# declarator.  Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
        )?
        (?:
            (?: # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
        )
        \s*
        (?:
            (?: # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
        )
    )
    # end var decl
    ''')
240
241
# The right-hand side of an initialization: an optional leading
# parenthesized group, then one or more string literals, a simple
# expression, or a brace-delimited struct/array literal that must be
# followed by ";".  Written for re.VERBOSE.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
        )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
            )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                [^'",;{{]*
            )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                    (?:
                        [^'";]*?
                        {_ind(STRING_LITERAL, 5)}
                    )*
                    [^'";]*?
                }}
                (?= \s* ; ) # Note this lookahead.
            )
        )
    )
    # end initializer
    ''')
286
287
288
#######################################
289
# compound type declarations
290
291
# One item inside a struct/union body: the opening of an inline
# compound type, a (possibly sized) member ending in "," or ";", or
# the closing brace.  Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?: # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?: # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?: # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                    )
                )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?: # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                [,;]
            )
        )
        |
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
    )
    ''')
349
350
# One item inside an enum body: either the closing brace, or a member
# name with an optional "= value" ending in "," or the closing brace.
# Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?: # <CLOSE>
                }}
            )
        )
        |
        (?:
            \s*
            (?: # <NAME>
                {IDENTIFIER}
            )
            (?:
                \s* = \s*
                (?: # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                )
            )?
            \s*
            (?: # <ENDING>
                , | }}
            )
        )
    )
    ''')
380
381
382
#######################################
383
# statements
384
385
# The text of a simple statement: any run of characters other than
# quotes, braces and ";", with string/character literals allowed in
# between.  Written for re.VERBOSE.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
        )*
        [^'"{{}};]*
        #(?= [;{{] ) # Note this lookahead.
    )
    # end simple statement body
    ''')
397
# A simple statement terminated by ";": a return with an initializer,
# a variable assignment, a catch-all return, or any simple statement
# body.  Written for re.VERBOSE.
# The member-access separator in the assignment branch is "[.]" so that
# only a literal "." (or "->") is accepted; a bare "." would match any
# character.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?: # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
            )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
            )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                    )*
                    \s* [^'";]*
                )?
            )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
            )
        )
        \s*
        (?: # <SIMPLE_ENDING>
            ;
        )
    )
    # end simple statement
    ''')
447
# The head of a compound statement: a bare "else"/"do", a label
# ("case ...", "default" or an identifier) followed by ":", or one of
# for/while/if/switch that is followed by "(".  Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?: # <COMPOUND_BARE>
                    else | do
                )
                \b
            )
            |
            (?:
                (?: # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                        )*
                        \s* [^'":]*
                    )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                )
                \s* [:]
            )
            |
            (?:
                (?: # <COMPOUND_PAREN>
                    for | while | if | switch
                )
                \s* (?= [(] ) # Note this lookahead.
            )
        )
        \s*
    )
    # end compound statement
    ''')
488
489
490
#######################################
491
# function bodies
492
493
# One item inside a function body: an empty statement, the start of an
# inline type declaration, a variable declaration, a compound-statement
# head, the start of a block, a simple statement, or the end of a
# block.  Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )? # </INLINE_PRE>
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            (?: # <STORAGE>
                {STORAGE_CLASS}
            )? # </STORAGE>
            (?:
                \s*
                (?: # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                )
            )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?: # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                    )
                )?
                (?:
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?: # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
            )? # </BLOCK_LEADING>
            (?: # <BLOCK_OPEN>
                {{
            )
        )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?: # <BLOCK_CLOSE>
            }}
        )
    )
    ''')
580
581
# One item inside a function body when only static variables matter:
# the start of an inline type declaration, a "static" variable
# declaration, or any other text up to the next "{", "}" or ";".
# Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?: # <INLINE_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
            )?
            (?: # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?: # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
            )
            \s*
            (?:
                (?: # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                )
                |
                (?: # <STATIC_ENDING>
                    [,;]
                )
            )
        )
        |
        # everything else
        (?:
            (?: # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                )*
                \s* [^'"{{}};]*
            )
            (?:
                (?: # <BLOCK_OPEN>
                    {{
                )
                |
                (?: # <BLOCK_CLOSE>
                    }}
                )
                |
                (?: # <STMT_END>
                    ;
                )
            )
        )
    )
    ''')
656
657
658
#######################################
659
# global declarations
660
661
# One top-level item: an empty statement, the start of a compound type
# declaration, a forward reference to a compound type, a typedef, or a
# function declaration/definition or variable declaration.
# Written for re.VERBOSE.
# NOTE(review): the "# <NAME>" markers are kept with the spacing shown
# in this copy; confirm they match what utils.set_capture_groups()
# looks for.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?: # <EMPTY>
            ;
        )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?: # <COMPOUND_LEADING>
                    [^;{{}}]+?
                )
                \s*
            )?
            (?: # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
            )
            (?:
                \s+
                (?: # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                )
            )?
            \s* {{
        )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?: # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
            )
            \s*
            (?: # <FORWARD_NAME>
                {ANON_IDENTIFIER}
            )
            (?: # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
            )
        )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?: # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?: # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                )
                \s* [)]
            )?
            \s* ;
        )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?: # <VAR_STORAGE>
                    {STORAGE_CLASS}
                )
                \s*
            )?
            (?:
                (?: # <FUNC_INLINE>
                    \b inline \b
                )
                \s*
            )?
            (?: # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
            )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?: # <FUNC_PARAMS>
                            [^{{;]*
                        )
                        \s* [)] \s*
                        (?: # <FUNC_DELIM>
                            [{{;]
                        )
                    )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?: # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                            )*
                        )
                        \s* {{
                    )
                )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?: # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                        )
                    )?
                    \s*
                    (?: # <VAR_ENDING>
                        [,;]
                    )
                )
            )
        )
    )
    ''')
806
807