#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <[email protected]>.

"""
Regular expression ancillary classes.

They help cache regular expressions and do the matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems at the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all logged messages.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})

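# Illustrative usage (not part of the original file), with a hypothetical
# "kdoc" logger:
#
#     tokenizer_set_log(logging.getLogger("kdoc"), prefix="c_lex: ")
#     log.warning("unexpected token")    # logged as "c_lex: unexpected token"
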
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0  #: A standard C or C99 comment, including delimiters.
    STRING = 1  #: A string, including quotation marks.
    CHAR = 2  #: A character, including apostrophes.
    NUMBER = 3  #: A number.
    PUNC = 4  #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5  #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6  #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7  #: A preprocessor macro.
    HASH = 8  #: The hash character - useful to handle other macros.
    OP = 9  #: A C operator (add, subtract, ...).
    STRUCT = 10  #: A ``struct`` keyword.
    UNION = 11  #: A ``union`` keyword.
    ENUM = 12  #: An ``enum`` keyword.
    TYPEDEF = 13  #: A ``typedef`` keyword.
    NAME = 14  #: A name. Can be an ID or a type.
    SPACE = 15  #: Any space characters, including newlines.
    ENDSTMT = 16  #: End of a statement (``;``).

    BACKREF = 17  #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string into an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

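# Illustrative only (not part of the original file): the helpers above map
# between the enum-like integers and their names, e.g.
#
#     CToken.to_name(CToken.STRUCT)    # -> "STRUCT"
#     CToken.from_name("STRUCT")       # -> CToken.STRUCT
#     CToken.from_name("bogus")        # -> CToken.MISMATCH
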
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different from .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST when the module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


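# Illustrative only (not part of the original file): RE_SCANNER is a single
# alternation of named groups, so match.lastgroup directly gives the token
# kind, e.g. scanning "struct foo;" is expected to produce STRUCT, SPACE,
# NAME and ENDSTMT matches, in this order.
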
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired by and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source`` using the regular expression built from
        RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        #
        # Store the logger to allow parser classes to re-use it
        #
        global log
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator,
        # we'll need to run the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        by ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end up re-using this tokenizer outside kernel-doc some day - or we
        # may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

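    # Illustrative only (not part of the original file): tokenizing a small
    # declaration produces one CToken per lexical element, e.g.
    #
    #     for tok in CTokenizer("int x = 1;").tokens:
    #         print(repr(tok))
    #
    # is expected to show NAME, SPACE, NAME, SPACE, OP, SPACE, NUMBER and
    # ENDSTMT tokens, in this order (assuming KernRe is a thin wrapper
    # around re for these patterns).
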
    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                    show = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


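# Illustrative only (not part of the original file): when a struct body
# contains a "/* private: */" marker, converting the tokenizer back to a
# string is expected to drop the comment itself and the members that follow
# it at that brace level, e.g.
#
#     str(CTokenizer("struct s { int a; /* private: */ int b; };"))
#
# should keep "int a;" but omit "int b;".
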
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" as the last element,
    the logic will be greedy, picking all other delimiters.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character on the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        raise an exception. Please notice that, in C, square brackets
        don't have any separator inside them. Trying to use ``\1``..``\n``
        for brackets also raises an exception.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard the first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches.
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [ new_tok ]
            else:
                new.tokens += [ tok ]

        return new.tokens


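# Illustrative only (not part of the original file): a replacement string
# with backrefs describes how matched arguments are re-emitted, e.g.
#
#     args = CTokenArgs(r"\2+")    # keep everything from the 2nd argument on
#     args.max_group               # -> 2
#     args.greedy                  # -> 2
#
# groups()/tokens() then replace each backref with the token list of the
# corresponding argument from a matched macro invocation; the trailing "+"
# makes the last backref greedy, so it also keeps the remaining arguments,
# delimiters included.
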
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several advanced
    regular expression features are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses when
    searching for STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

    https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    I re-implemented it to make it more generic and to match 3 types of
    delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """


    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters and places them
        on a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If there is no END bringing the level back to zero, return the
        # remaining tokens. This is meant to solve cases where the caller
        # logic might be picking an incomplete block.
        #
        if start and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: the matched paired data is placed
        there, with the delimiters stripped.

        If count is different from zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect the sub arguments (backrefs) contained in sub_str

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

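    # Illustrative only (not part of the original file), using a hypothetical
    # STRUCT_GROUP() invocation as input:
    #
    #     cm = CMatch(r"STRUCT_GROUP")
    #     list(cm.search("STRUCT_GROUP(int a; int b;) more;"))
    #     # is expected to yield the full "STRUCT_GROUP(...)" block
    #
    #     cm.sub(r"\0", "STRUCT_GROUP(int a; int b;) more;")
    #     # is expected to strip the STRUCT_GROUP( ... ) wrapper, keeping
    #     # the members inside the parentheses
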
    def __repr__(self):
        """
        Return a displayable version of the class initialization.
        """

        return f'CMatch("{self.regex.regex.pattern}")'