Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/tools/lib/python/kdoc/kdoc_re.py
38186 views
1
#!/usr/bin/env python3
2
# SPDX-License-Identifier: GPL-2.0
3
# Copyright(c) 2025: Mauro Carvalho Chehab <[email protected]>.
4
5
"""
6
Regular expression ancillary classes.
7
8
Those help caching regular expressions and do matching for kernel-doc.
9
"""
10
11
import re
12
13
# Local cache for regular expressions
14
re_cache = {}
15
16
17
class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache is keyed by (pattern, flags): the same pattern compiled
        with different flags produces a different regex object, so caching
        by pattern alone could hand back a regex compiled with the wrong
        flags.
        """
        key = (string, flags)

        self.regex = re_cache.get(key, None)
        if not self.regex:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: regular expression pattern
        :param cache:  if True, store the compiled regex in the module
                       cache for reuse by later instances
        :param flags:  flags forwarded to re.compile()
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The result caches if either operand caches, and carries the
        union of both operands' compile flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match
        """

        return self.last_match.group(num)
105
106
107
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    # the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported open delimiter to its closing counterpart
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single delimiter character, open or close
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        Yields (start, offset, end) tuples: start is where the regex
        matched, offset is the position right after the open delimiter,
        and end is the position right after the matching close delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at an open delimiter; if the
            # last matched char isn't one, skip this occurrence
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if r'\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            new_sub = sub
            new_sub = new_sub.replace(r'\1', value)

            out += new_sub

            # Drop end ';' if any. Guard against the close delimiter
            # being the very last char of line, which would otherwise
            # raise IndexError here.
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop after 'count' replacements (count == 0 means replace
            # all). The previous test (count >= n) broke out right after
            # the first replacement for any count > 1.
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
271
272