CoCalc -- parse_data

GitHub Repository: torvalds/linux
Path: blob/master/tools/lib/python/kdoc/parse_data_structs.py
¹²²⁹⁴¹ views
1
#!/usr/bin/env python3
2
# SPDX-License-Identifier: GPL-2.0
3
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <[email protected]>.
4
# pylint: disable=R0912,R0915
5

6
"""
7
Parse a source file or header, creating ReStructured Text cross references.
8

9
It accepts an optional file to change the default symbol reference or to
10
suppress symbols from the output.
11

12
It is capable of identifying ``define``, function, ``struct``, ``typedef``,
13
``enum`` and enum-value symbols, and of creating cross-references for all of them.
14
It is also capable of distinguishing a #define used for specifying a Linux
15
ioctl.
16

17
The optional rules file contains a set of rules like::
18

19
    ignore ioctl VIDIOC_ENUM_FMT
20
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
21
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
22
"""
23

24
import os
25
import re
26
import sys
27

28

29
class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It identifies ``#define``, ``struct``, ``typedef`` and ``enum``
    declarations, as well as the values declared inside enums, and creates
    cross-references for all of them. It also distinguishes a ``#define``
    that specifies a Linux ioctl (its value starts with ``_IO``).

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules::

        ignore <type> <symbol>

       Removes the symbol from reference generation.

    2. Replace rules::

        replace <type> <old_symbol> <new_reference>

       Replaces the reference generated for old_symbol with a new one.
       The new_reference can be:

        - A simple symbol name;
        - A full Sphinx reference.

    3. Namespace rules::

        namespace <namespace>

       Sets C namespace to be used during cross-reference generation. Can
       be overridden by replace rules.

    On ignore and replace rules, ``<type>`` can be:
        - ``ioctl``: for defines whose value starts with ``_IO``, e.g. ioctl
          definitions
        - ``define``: for other defines
        - ``symbol``: for symbols defined within enums;
        - ``typedef``: for typedefs;
        - ``enum``: for the name of a non-anonymous enum;
        - ``struct``: for structs.

    Examples::

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`

        namespace MC
    """

    #: Parser regex with multiple ways to capture enums.
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]

    #: Parser regex with multiple ways to capture structs.
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)$"),
    ]

    # NOTE: the original code was written a long time before Sphinx C
    # domain gained support for multiple namespaces. To avoid too much
    # churn at the existing hyperlinks, the code kept using "c:type"
    # instead of the right types. To change that, we need to change the
    # types not only here, but also at the uAPI media documentation.

    #: Dictionary containing C type identifiers to be transformed.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """
        Initialize internal vars.

        ``debug`` enables debug_print(); a value greater than 1 also makes
        parse_file() print every logical line it processes.
        """
        self.debug = debug

        # Indented copy of the parsed file, filled by store_line()
        self.data = ""

        # One dictionary per symbol type: symbol -> (reference, line number)
        self.symbols = {symbol_type: {} for symbol_type in self.DEF_SYMBOL_TYPES}

        # Rules loaded by read_exceptions()
        self.namespace = None
        self.ignore = []
        self.replace = []

        # Base name of the exceptions file, used on diagnostic messages
        self.exceptions_name = None

    def read_exceptions(self, fname: str):
        """
        Read an optional exceptions file, used to override defaults.

        Blank lines and lines starting with ``#`` are skipped. Ignore and
        replace rules are queued at self.ignore and self.replace together
        with their (one-based) line number; a namespace rule sets
        self.namespace. Any other line terminates the program via
        sys.exit().
        """

        if not fname:
            return

        name = os.path.basename(fname)

        # Kept for the messages issued later by apply_exceptions()
        self.exceptions_name = name

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, start=1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    self.ignore.append((ln, match.group(1), match.group(2)))
                    continue

                # replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if match:
                    self.replace.append((ln, match.group(1), match.group(2),
                                         match.group(3)))
                    continue

                # namespace rules
                match = re.match(r"^namespace\s+(\S+)", line)
                if match:
                    self.namespace = match.group(1)
                    continue

                sys.exit(f"{name}:{ln}: invalid line: {line}")

    def apply_exceptions(self):
        """
        Process exceptions file with rules to ignore or replace references.

        Terminates the program via sys.exit() when a rule names a type that
        is not a key of DEF_SYMBOL_TYPES. A replace rule for a symbol that
        was not found only prints a warning.
        """

        # Rules may have been queued without read_exceptions(); still
        # produce a meaningful prefix for the messages in that case.
        name = self.exceptions_name or "<exceptions>"

        # Handle ignore rules
        for ln, c_type, symbol in self.ignore:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            # Ignoring a symbol that was never detected is not an error
            self.symbols[c_type].pop(symbol, None)

        # Handle replace rules
        for ln, c_type, old, new in self.replace:
            if c_type not in self.DEF_SYMBOL_TYPES:
                sys.exit(f"{name}:{ln}: {c_type} is invalid")

            reftype = None

            # Parse reference type when the type is specified
            match = re.match(r"^\:c\:(\w+)\:\`(.+)\`", new)
            if match:
                reftype = f":c:{match.group(1)}"
                new = match.group(2)
            else:
                match = re.search(r"(\:ref)\:\`(.+)\`", new)
                if match:
                    reftype = match.group(1)
                    new = match.group(2)

            # If the replacement rule doesn't have a type, get default
            if not reftype:
                defs = self.DEF_SYMBOL_TYPES[c_type]
                reftype = defs.get("ref_type") or defs.get("real_type")

            new_ref = f"{reftype}:`{old} <{new}>`"

            # Change self.symbols to use the replacement rule, keeping the
            # line number where the symbol was found at the parsed file
            if old in self.symbols[c_type]:
                (_, symbol_ln) = self.symbols[c_type][old]
                self.symbols[c_type][old] = (new_ref, symbol_ln)
            else:
                print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def store_type(self, ln, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Store a new symbol at self.symbols under symbol_type.

        The stored value is a ``(reference, ln)`` tuple, where reference is
        the ReST markup that will replace the symbol at the output.

        When ref_name is not given, the lower-case symbol name is used as
        the reference target. By default, underscores at the target of
        ``:ref`` references are replaced by ``-``.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        # Determine ref_link based on symbol type
        if ref_type or self.namespace:
            if not ref_name:
                ref_name = symbol.lower()

            # c-type references don't support hash
            if ref_type == ":ref" and replace_underscores:
                ref_name = ref_name.replace("_", "-")

            # C domain references may have namespaces. ref_type can be
            # None here when only a namespace is set, so check it first.
            if ref_type and ref_type.startswith(":c:") and self.namespace:
                ref_name = f"{self.namespace}.{ref_name}"

            if ref_type:
                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
            else:
                ref_link = f"`{symbol} <{ref_name}>`"
        else:
            ref_link = symbol

        self.symbols[symbol_type][symbol] = (f"{prefix}{ref_link}{suffix}", ln)

    def store_line(self, line):
        """
        Store a line at self.data, properly indented.

        Tabs are expanded and the line is prefixed by four spaces, as
        write_output() places the text inside a literal block.
        """
        line = "    " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str, exceptions: str = None):
        """
        Read a C source file and get identifiers.

        Every line is stored verbatim via store_line(). Lines ending with a
        backslash and multi-line comments are grouped into a single logical
        line before being matched against the supported declarations.
        Once the file is parsed, the rules from the optional exceptions
        file are applied.

        Line numbers stored with each symbol are zero-based. For a
        statement continued with backslashes, the number of its first
        physical line is used.
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        # First physical line of a backslash-continued statement, if any
        start_no = None

        self.read_exceptions(exceptions)

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines: drop the trailing backslash
                # and keep accumulating until the statement ends
                if line.endswith("\\"):
                    if start_no is None:
                        start_no = line_no
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if re.search(r"/\*.*", line):
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if re.search(r".*\*/", line):
                        is_comment = False
                    else:
                        multiline = line
                        continue

                # The logical line is complete: pick the line number to be
                # associated with whatever is declared on it
                stmt_no = line_no if start_no is None else start_no
                start_no = None

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below

                line = re.sub(r"(/\*.*\*/)", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {stmt_no + 1}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.

                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type(stmt_no, "symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type(stmt_no, "ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type(stmt_no, "define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type(stmt_no, "typedef", symbol, ref_name=name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type(stmt_no, "enum", match.group(1))

                        # Only wait for enum values at the next lines if
                        # the enum body isn't closed on this very line
                        is_enum = "}" not in line[match.end():]
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type(stmt_no, "struct", match.group(1))
                        break

        self.apply_exceptions()

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.

        Does nothing unless self.debug is set.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, (ref, ln) in sorted(refs.items()):
                print(f"  #{ln:<5d} {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the parsed file as ReST text, with each known symbol
        replaced by its cross-reference.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, (replacement, _) in ref_dict.items():
                # The text was escaped above, so escape the symbol the same
                # way before turning it into a regular expression
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))

                # Use a callable, so that backslashes inside the reference
                # are inserted literally instead of being parsed as regex
                # replacement escapes
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              lambda m, ref=replacement:
                              m.group(1) + ref + m.group(2),
                              text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a list of symbols to be part of a TOC contents table.

        Returns a string with one section per non-empty symbol type,
        sorted by the type description, each listing its symbols in
        alphabetical order.
        """
        text = []

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (defs["description"], c_type)
            for c_type, defs in self.DEF_SYMBOL_TYPES.items()
        )

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically
            for _, (ref, ln) in sorted(refs.items()):
                text.append(f"- LINENO_{ln}: {ref}")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write a ReST output file.

        The base name of file_in is used as the document title. When toc
        is true, the symbol list from gen_toc() is written; otherwise, the
        cross-referenced source from gen_output() is written inside a
        ``parsed-literal`` block.
        """

        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)
499

500
Product

Resources

Company