CoCalc -- parse_data

GitHub Repository: torvalds/linux
Path: blob/master/tools/lib/python/kdoc/parse_data_structs.py
³⁸¹⁸⁶ views
1
#!/usr/bin/env python3
2
# SPDX-License-Identifier: GPL-2.0
3
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <[email protected]>.
4
# pylint: disable=R0912,R0915
5

6
"""
7
Parse a source file or header, creating ReStructured Text cross references.
8

9
It accepts an optional file to change the default symbol reference or to
10
suppress symbols from the output.
11

12
It is capable of identifying defines, functions, structs, typedefs,
13
enums and enum symbols and create cross-references for all of them.
14
It is also capable of distinguishing #define used for specifying a Linux
15
ioctl.
16

17
The optional rules file contains a set of rules like:
18

19
    ignore ioctl VIDIOC_ENUM_FMT
20
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
21
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
22
"""
23

24
import os
25
import re
26
import sys
27

28

29
class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying defines, functions, structs, typedefs,
    enums and enum symbols and create cross-references for all of them.
    It is also capable of distinguishing #define used for specifying a Linux
    ioctl.

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules:

        ignore <type> <symbol>

       Removes the symbol from reference generation.

    2. Replace rules:

        replace <type> <old_symbol> <new_reference>

       Replaces how old_symbol is referenced. The new_reference can be:

        - A simple symbol name;
        - A full Sphinx reference.

    3. Namespace rules

        namespace <namespace>

       Sets C namespace to be used during cross-reference generation. Can
       be overridden by replace rules.

    On ignore and replace rules, <type> can be:
        - ioctl: for defines whose value starts with _IO*, e.g. ioctl
          definitions
        - define: for other defines
        - symbol: for symbols defined within enums;
        - typedef: for typedefs;
        - enum: for the name of a non-anonymous enum;
        - struct: for structs.

    Examples:

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

        namespace MC
    """

    # Parser regexes with multiple ways to capture enums and structs
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # Single-statement matchers, compiled once as they run for every line
    _RE_ENUM_SYMBOL = re.compile(r"^\s*([_\w][\w\d_]+)\s*[\,=]?")
    _RE_IOCTL = re.compile(r"^\s*#\s*define\s+([\w_]+)\s+_IO")
    _RE_DEFINE = re.compile(r"^\s*#\s*define\s+([\w_]+)(\s+|$)")
    _RE_TYPEDEF = re.compile(
        r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);")

    # FIXME: the original code was written a long time before Sphinx C
    # domain to have multiple namespaces. To avoid too much churn at the
    # existing hyperlinks, the code kept using "c:type" instead of the
    # right types. To change that, we need to change the types not only
    # here, but also at the uAPI media documentation.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """Initialize internal vars"""
        self.debug = debug
        self.data = ""

        # One dict per symbol type: symbol -> (reference text, line number)
        self.symbols = {symbol_type: {}
                        for symbol_type in self.DEF_SYMBOL_TYPES}

        self.namespace = None
        self.ignore = []
        self.replace = []

        # Basename of the last exceptions file read; used on diagnostics
        # emitted by apply_exceptions().
        self.exceptions_name = None

    def read_exceptions(self, fname: str):
        """
        Read the exceptions file, queueing its rules.

        Ignore rules are appended to self.ignore, replace rules to
        self.replace and a namespace rule sets self.namespace. Blank lines
        and lines starting with "#" are skipped. Nothing is done if fname
        is empty.

        Exits via sys.exit() on a line that matches no rule syntax.
        """
        if not fname:
            return

        name = os.path.basename(fname)

        # Remembered so that apply_exceptions() can report file:line
        self.exceptions_name = name

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, 1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    self.ignore.append((ln, match.group(1), match.group(2)))
                    continue

                # replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if match:
                    self.replace.append((ln, match.group(1), match.group(2),
                                         match.group(3)))
                    continue

                # namespace rules
                match = re.match(r"^namespace\s+(\S+)", line)
                if match:
                    self.namespace = match.group(1)
                    continue

                sys.exit(f"{name}:{ln}: invalid line: {line}")

    def apply_exceptions(self):
        """
        Process exceptions file with rules to ignore or replace references.

        Exits via sys.exit() if a rule names an unknown symbol type. A
        replace rule for a symbol that was not parsed only prints a warning.
        """
        name = self.exceptions_name or "<exceptions>"

        # Handle ignore rules
        for ln, c_type, symbol in self.ignore:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            self.symbols[c_type].pop(symbol, None)

        # Handle replace rules
        for ln, c_type, old, new in self.replace:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            reftype = None

            # Parse reference type when the type is specified
            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
            if match:
                reftype = f":c:{match.group(1)}"
                new = match.group(2)
            else:
                match = re.search(r"(\:ref)\:\`(.+)\`", new)
                if match:
                    reftype = match.group(1)
                    new = match.group(2)

            # If the replacement rule doesn't have a type, get default
            if not reftype:
                reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                if not reftype:
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

            new_ref = f"{reftype}:`{old} <{new}>`"

            # Change self.symbols to use the replacement rule, keeping the
            # source line where the symbol was found.
            if old in self.symbols[c_type]:
                (_, src_ln) = self.symbols[c_type][old]
                self.symbols[c_type][old] = (new_ref, src_ln)
            else:
                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def store_type(self, ln, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Stores a new symbol at self.symbols under symbol_type.

        ln is the line number recorded together with the reference.
        ref_name is the reference target; it defaults to the lower-case
        symbol name.

        By default, underscores are replaced by "-" on ":ref" targets.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        # Determine ref_link based on symbol type
        if ref_type or self.namespace:
            if not ref_name:
                ref_name = symbol.lower()

            # c-type references don't support hash
            if ref_type == ":ref" and replace_underscores:
                ref_name = ref_name.replace("_", "-")

            # C domain references may have namespaces. ref_type may be
            # None here when only the namespace is set.
            if ref_type and ref_type.startswith(":c:") and self.namespace:
                ref_name = f"{self.namespace}.{ref_name}"

            if ref_type:
                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
            else:
                ref_link = f"`{symbol} <{ref_name}>`"
        else:
            ref_link = symbol

        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)

    def store_line(self, line):
        """Stores a line at self.data, properly indented"""
        line = "    " + line.expandtabs()
        self.data += line.rstrip(" ")

    def _parse_statement(self, ln, line, is_enum):
        """
        Identify the symbol declared on a comment-free statement and store it.

        ln is the line number to record, line the statement text and
        is_enum tells if the statement is inside an enum body.

        Returns the is_enum state to be used for the next statement.
        """
        if is_enum:
            match = self._RE_ENUM_SYMBOL.match(line)
            if match:
                self.store_type(ln, "symbol", match.group(1))

            # The enum body ends at the closing brace
            return "}" not in line

        match = self._RE_IOCTL.match(line)
        if match:
            self.store_type(ln, "ioctl", match.group(1),
                            replace_underscores=False)
            return False

        match = self._RE_DEFINE.match(line)
        if match:
            self.store_type(ln, "define", match.group(1))
            return False

        match = self._RE_TYPEDEF.match(line)
        if match:
            name = match.group(2).strip()
            symbol = match.group(3)
            self.store_type(ln, "typedef", symbol, ref_name=name)
            return False

        for re_enum in self.RE_ENUMS:
            match = re_enum.match(line)
            if match:
                self.store_type(ln, "enum", match.group(1))

                # An enum opened and closed on the same line must not
                # leave the parser inside an enum body.
                is_enum = "}" not in line
                break

        for re_struct in self.RE_STRUCTS:
            match = re_struct.match(line)
            if match:
                self.store_type(ln, "struct", match.group(1))
                break

        return is_enum

    def parse_file(self, file_in: str, exceptions: str = None):
        """
        Reads a C source file and get identifiers.

        Every input line is stored at self.data. Lines ending with a
        backslash and multi-line comments are grouped into a single
        statement before being parsed. The line number recorded for a
        symbol is the zero-based number of the first physical line of its
        statement.

        If exceptions is given, its rules are read before parsing and
        applied once the whole file was parsed.
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""
        first_no = None

        self.read_exceptions(exceptions)

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f):
                self.store_line(line)
                line = line.strip("\n")

                # Remember where a (possibly multi-line) statement starts
                if first_no is None:
                    first_no = line_no

                # Handle continuation lines: drop the trailing backslash
                # and keep accumulating.
                if line.endswith("\\"):
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if "/*" in line:
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if "*/" in line:
                        is_comment = False
                    else:
                        multiline = line
                        continue

                # The statement is complete: the next line starts a new one
                stmt_no, first_no = first_no, None

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below.
                # Non-greedy, to keep code placed between two comments.
                line = re.sub(r"/\*.*?\*/", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {stmt_no + 1}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.
                is_enum = self._parse_statement(stmt_no, line, is_enum)

        self.apply_exceptions()

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, (ref, ln) in sorted(refs.items()):
                print(f"  #{ln:<5d} {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the stored file content as text for a parsed-literal block,
        with Sphinx special characters escaped and each known symbol
        replaced by its cross-reference.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = text.replace("DEPRECATED", "**DEPRECATED**")

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, (replacement, _) in ref_dict.items():
                # The text was escaped above, so escape the symbol the
                # same way before turning it into a regex.
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))

                # Use a callable so backslashes inside the replacement
                # are never interpreted as regex template escapes.
                text = re.sub(
                    f"{start_delim}{symbol}{end_delim}",
                    lambda m, ref=replacement: m.group(1) + ref + m.group(2),
                    text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a list of symbols to be part of a TOC contents table
        """
        text = []

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (v['description'], k) for k, v in self.DEF_SYMBOL_TYPES.items())

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for _, (ref, ln) in sorted(refs.items()):
                text.append(f"- LINENO_{ln}: {ref}")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write an RST document to file_out, titled after file_in basename.

        If toc is true, the body is the output of gen_toc(). Otherwise it
        is the output of gen_output(), placed under a parsed-literal
        directive.
        """
        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)
483

484
Product

Resources

Company