#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <[email protected]>.

"""
Regular expression ancillary classes.

They help cache regular expressions and do the matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems at the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all logged messages.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})

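# Illustrative usage (not part of the original file), with a hypothetical
# "kdoc" logger:
#
#     tokenizer_set_log(logging.getLogger("kdoc"), prefix="c_lex: ")
#     log.warning("unexpected token")    # logged as "c_lex: unexpected token"
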
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0  #: A standard C or C99 comment, including delimiters.
    STRING = 1  #: A string, including quotation marks.
    CHAR = 2  #: A character, including apostrophes.
    NUMBER = 3  #: A number.
    PUNC = 4  #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5  #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6  #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7  #: A preprocessor macro.
    HASH = 8  #: The hash character - useful to handle other macros.
    OP = 9  #: A C operator (add, subtract, ...).
    STRUCT = 10  #: A ``struct`` keyword.
    UNION = 11  #: A ``union`` keyword.
    ENUM = 12  #: An ``enum`` keyword.
    TYPEDEF = 13  #: A ``typedef`` keyword.
    NAME = 14  #: A name. Can be an ID or a type.
    SPACE = 15  #: Any space characters, including newlines.
    ENDSTMT = 16  #: End of a statement (``;``).

    BACKREF = 17  #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string into an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

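# Illustrative only (not part of the original file): the helpers above map
# between the enum-like integers and their names, e.g.
#
#     CToken.to_name(CToken.STRUCT)    # -> "STRUCT"
#     CToken.from_name("STRUCT")       # -> CToken.STRUCT
#     CToken.from_name("bogus")        # -> CToken.MISMATCH
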
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different from .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')

#: Tokenizer regex, built from RE_SCANNER_LIST when the module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


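# Illustrative only (not part of the original file): RE_SCANNER is a single
# alternation of named groups, so match.lastgroup directly gives the token
# kind, e.g. scanning "struct foo;" is expected to produce STRUCT, SPACE,
# NAME and ENDSTMT matches, in this order.
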
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired by and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source`` using the regular expression built from
        RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        #
        # Store the logger to allow parser classes to re-use it
        #
        global log
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator,
        # we'll need to run the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        by ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end up re-using this tokenizer outside kernel-doc some day - or we
        # may eventually remove it from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

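    # Illustrative only (not part of the original file): tokenizing a small
    # declaration produces one CToken per lexical element, e.g.
    #
    #     for tok in CTokenizer("int x = 1;").tokens:
    #         print(repr(tok))
    #
    # is expected to show NAME, SPACE, NAME, SPACE, OP, SPACE, NUMBER and
    # ENDSTMT tokens, in this order (assuming KernRe is a thin wrapper
    # around re for these patterns).
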
    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                    show = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out


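# Illustrative only (not part of the original file): when a struct body
# contains a "/* private: */" marker, converting the tokenizer back to a
# string is expected to drop the comment itself and the members that follow
# it at that brace level, e.g.
#
#     str(CTokenizer("struct s { int a; /* private: */ int b; };"))
#
# should keep "int a;" but omit "int b;".
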
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" as the last element,
    the logic will be greedy, picking all other delimiters.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character on the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        raise an exception. Please notice that, in C, square brackets
        don't have any separator inside them. Trying to use ``\1``..``\n``
        for brackets also raises an exception.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard the first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches.
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [ new_tok ]
            else:
                new.tokens += [ tok ]

        return new.tokens


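# Illustrative only (not part of the original file): a replacement string
# with backrefs describes how matched arguments are re-emitted, e.g.
#
#     args = CTokenArgs(r"\2+")    # keep everything from the 2nd argument on
#     args.max_group               # -> 2
#     args.greedy                  # -> 2
#
# groups()/tokens() then replace each backref with the token list of the
# corresponding argument from a matched macro invocation; the trailing "+"
# makes the last backref greedy, so it also keeps the remaining arguments,
# delimiters included.
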
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several advanced
    regular expression features are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses when
    searching for STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

    https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    I re-implemented it to make it more generic and to match 3 types of
    delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """


    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to align all three
        types of delimiters and seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters and places them
        on a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unpaired delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END
                if i > start:
                    i -= 1
                yield start, i
                start = None

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If there is no END bringing the level back to zero, return the
        # remaining tokens. This is meant to solve cases where the caller
        # logic might be picking an incomplete block.
        #
        if start and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: the matched paired data is placed
        there, with the delimiters stripped.

        If count is different from zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect the sub arguments (backrefs) contained in sub_str

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlapping matches at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

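    # Illustrative only (not part of the original file), using a hypothetical
    # STRUCT_GROUP() invocation as input:
    #
    #     cm = CMatch(r"STRUCT_GROUP")
    #     list(cm.search("STRUCT_GROUP(int a; int b;) more;"))
    #     # is expected to yield the full "STRUCT_GROUP(...)" block
    #
    #     cm.sub(r"\0", "STRUCT_GROUP(int a; int b;) more;")
    #     # is expected to strip the STRUCT_GROUP( ... ) wrapper, keeping
    #     # the members inside the parentheses
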
    def __repr__(self):
        """
        Return a displayable version of the class initialization.
        """

        return f'CMatch("{self.regex.regex.pattern}")'