Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
torvalds
GitHub Repository: torvalds/linux
Path: blob/master/tools/lib/python/kdoc/kdoc_re.py
122941 views
1
#!/usr/bin/env python3
2
# SPDX-License-Identifier: GPL-2.0
3
# Copyright(c) 2025: Mauro Carvalho Chehab <[email protected]>.
4
5
"""
6
Regular expression ancillary classes.
7
8
Those help caching regular expressions and do matching for kernel-doc.
9
"""
10
11
import re
12
13
# Local cache for regular expressions, keyed by (pattern, flags).
# Keying on the pattern alone would silently reuse a regex compiled
# with different flags (e.g. a case-sensitive one when re.IGNORECASE
# was requested).
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache key is (pattern, flags): the same pattern compiled with
        different flags gets its own entry, so flag differences are never
        silently dropped.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        :param string: regular expression pattern
        :param cache:  if True, store the compiled regex in the module
                       cache for reuse
        :param flags:  flags passed to re.compile()
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The resulting KernRe concatenates both patterns; it is cached if
        either operand is cached, and carries the union of both flag sets.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        Raises AttributeError if no match()/search() was done before, and
        IndexError if the group number is out of range — same as re does.
        """

        return self.last_match.group(num)
108
109
110
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each open delimiter to its expected close delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single open or close delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        Yields (start, offset, pos) tuples where start is where the regex
        match begins, offset is the position right after the open
        delimiter, and pos is the position right after the matching close
        delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end on an open delimiter; skip
            # matches that don't
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

                r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            new_sub = sub
            new_sub = new_sub.replace(r'\1', value)

            out += new_sub

            # Drop end ';' if any. Guard against the match ending exactly
            # at the end of the string, where line[pos] would raise
            # IndexError.
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Honor "at most count items". The previous condition
            # (count >= n) stopped after the first replacement for any
            # count >= 1.
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
279
280