CoCalc -- parse_data

GitHub Repository: torvalds/linux
Path: blob/master/tools/docs/lib/parse_data_structs.py
²⁹⁵²⁰ views
1
#!/usr/bin/env python3
2
# SPDX-License-Identifier: GPL-2.0
3
# Copyright (c) 2016-2025 by Mauro Carvalho Chehab <[email protected]>.
4
# pylint: disable=R0912,R0915
5

6
"""
7
Parse a source file or header, creating ReStructured Text cross references.
8

9
It accepts an optional file to change the default symbol reference or to
10
suppress symbols from the output.
11

12
It is capable of identifying defines, functions, structs, typedefs,
13
enums and enum symbols, creating cross-references for all of them.
14
It is also capable of distinguishing #define used for specifying a Linux
15
ioctl.
16

17
The optional rules file contains a set of rules like:
18

19
    ignore ioctl VIDIOC_ENUM_FMT
20
    replace ioctl VIDIOC_DQBUF vidioc_qbuf
21
    replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
22
"""
23

24
import os
25
import re
26
import sys
27

28

29
class ParseDataStructs:
    """
    Creates an enriched version of a Kernel header file with cross-links
    to each C data structure type.

    It is meant to allow having a more comprehensive documentation, where
    uAPI headers will create cross-reference links to the code.

    It is capable of identifying defines, functions, structs, typedefs,
    enums and enum symbols, creating cross-references for all of them.
    It is also capable of distinguishing #define used for specifying a
    Linux ioctl.

    By default, it creates rules for all symbols and defines, but it also
    allows parsing an exception file. Such file contains a set of rules
    using the syntax below:

    1. Ignore rules:

        ignore <type> <symbol>

    Removes the symbol from reference generation.

    2. Replace rules:

        replace <type> <old_symbol> <new_reference>

    Replaces how old_symbol is referenced. The new_reference can be:
        - A simple symbol name;
        - A full Sphinx reference.

    On both cases, <type> can be:
        - ioctl: for defines whose value starts with _IO*, e.g. ioctl
          definitions
        - define: for other defines
        - symbol: for symbols defined within enums;
        - typedef: for typedefs;
        - enum: for the name of a non-anonymous enum;
        - struct: for structs.

    Examples:

        ignore define __LINUX_MEDIA_H
        ignore ioctl VIDIOC_ENUM_FMT
        replace ioctl VIDIOC_DQBUF vidioc_qbuf
        replace define V4L2_EVENT_MD_FL_HAVE_FRAME_SEQ :c:type:`v4l2_event_motion_det`
    """

    # Parser regexes with multiple ways to capture enums and structs.
    # The "bare name" variants cover declarations whose opening brace is
    # on the following line; they tolerate trailing whitespace.
    RE_ENUMS = [
        re.compile(r"^\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*enum\s+([\w_]+)\s*$"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*enum\s+([\w_]+)\s*$"),
    ]
    RE_STRUCTS = [
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*struct\s+([_\w][\w\d_]+)\s*$"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*\{"),
        re.compile(r"^\s*typedef\s*struct\s+([_\w][\w\d_]+)\s*$"),
    ]

    # FIXME: the original code was written a long time before Sphinx C
    # domain had multiple namespaces. To avoid too much churn at the
    # existing hyperlinks, the code kept using "c:type" instead of the
    # right types. To change that, we need to change the types not only
    # here, but also at the uAPI media documentation.
    DEF_SYMBOL_TYPES = {
        "ioctl": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "IOCTL Commands",
        },
        "define": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Macros and Definitions",
        },
        # We're calling each definition inside an enum as "symbol"
        "symbol": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":ref",
            "description": "Enumeration values",
        },
        "typedef": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Type Definitions",
        },
        # This is the description of the enum itself
        "enum": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Enumerations",
        },
        "struct": {
            "prefix": "\\ ",
            "suffix": "\\ ",
            "ref_type": ":c:type",
            "description": "Structures",
        },
    }

    def __init__(self, debug: bool = False):
        """
        Initialize internal vars.

        ``debug`` enables debug_print(); a value greater than 1 also makes
        parse_file() print every logical line after comment stripping.
        """
        self.debug = debug
        self.data = ""

        # One symbol -> reference dictionary per supported symbol type
        self.symbols = {symbol_type: {}
                        for symbol_type in self.DEF_SYMBOL_TYPES}

    def store_type(self, symbol_type: str, symbol: str,
                   ref_name: str = None, replace_underscores: bool = True):
        """
        Stores a new symbol at self.symbols under symbol_type.

        The stored value is the ReST reference that will replace the
        symbol at the output. When ref_name is not given, the lowercase
        symbol name is used as the reference target.

        By default, underscores are replaced by "-" on ":ref" targets.
        """
        defs = self.DEF_SYMBOL_TYPES[symbol_type]

        prefix = defs.get("prefix", "")
        suffix = defs.get("suffix", "")
        ref_type = defs.get("ref_type")

        # Determine ref_link based on symbol type
        if ref_type:
            if symbol_type == "enum":
                ref_link = f"{ref_type}:`{symbol}`"
            else:
                if not ref_name:
                    ref_name = symbol.lower()

                # c-type references don't support hash
                if ref_type == ":ref" and replace_underscores:
                    ref_name = ref_name.replace("_", "-")

                ref_link = f"{ref_type}:`{symbol} <{ref_name}>`"
        else:
            ref_link = symbol

        self.symbols[symbol_type][symbol] = f"{prefix}{ref_link}{suffix}"

    def store_line(self, line):
        """Stores a line at self.data, properly indented"""
        line = "    " + line.expandtabs()
        self.data += line.rstrip(" ")

    def parse_file(self, file_in: str):
        """
        Reads a C source file and get identifiers.

        Every input line is appended (indented) to self.data, and each
        symbol found is registered at self.symbols via store_type().
        """
        self.data = ""
        is_enum = False
        is_comment = False
        multiline = ""

        with open(file_in, "r",
                  encoding="utf-8", errors="backslashreplace") as f:
            for line_no, line in enumerate(f, start=1):
                self.store_line(line)
                line = line.strip("\n")

                # Handle continuation lines: a trailing backslash joins
                # this line with the next one. Keep the line content and
                # drop only the backslash itself.
                if line.endswith("\\"):
                    multiline += line[:-1]
                    continue

                if multiline:
                    line = multiline + line
                    multiline = ""

                # Handle comments. They can be multilined
                if not is_comment:
                    if "/*" in line:
                        is_comment = True
                    else:
                        # Strip C99-style comments
                        line = re.sub(r"(//.*)", "", line)

                if is_comment:
                    if "*/" in line:
                        is_comment = False
                    else:
                        # Keep accumulating until the comment is closed
                        multiline = line
                        continue

                # At this point, line variable may be a multilined statement,
                # if lines end with \ or if they have multi-line comments
                # With that, it can safely remove the entire comments,
                # and there's no need to use re.DOTALL for the logic below.
                #
                # The match is non-greedy: C comments don't nest, so each
                # comment ends at the first "*/". A greedy match would also
                # drop any code placed between two comments.
                line = re.sub(r"(/\*.*?\*/)", "", line)
                if not line.strip():
                    continue

                # It can be useful for debug purposes to print the file after
                # having comments stripped and multi-lines grouped.
                if self.debug > 1:
                    print(f"line {line_no}: {line}")

                # Now the fun begins: parse each type and store it.

                # We opted for a two parsing logic here due to:
                # 1. it makes easier to debug issues not-parsed symbols;
                # 2. we want symbol replacement at the entire content, not
                #    just when the symbol is detected.

                if is_enum:
                    match = re.match(r"^\s*([_\w][\w\d_]+)\s*[\,=]?", line)
                    if match:
                        self.store_type("symbol", match.group(1))
                    if "}" in line:
                        is_enum = False
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)\s+_IO", line)
                if match:
                    self.store_type("ioctl", match.group(1),
                                    replace_underscores=False)
                    continue

                match = re.match(r"^\s*#\s*define\s+([\w_]+)(\s+|$)", line)
                if match:
                    self.store_type("define", match.group(1))
                    continue

                match = re.match(r"^\s*typedef\s+([_\w][\w\d_]+)\s+(.*)\s+([_\w][\w\d_]+);",
                                 line)
                if match:
                    name = match.group(2).strip()
                    symbol = match.group(3)
                    self.store_type("typedef", symbol, ref_name=name)
                    continue

                for re_enum in self.RE_ENUMS:
                    match = re_enum.match(line)
                    if match:
                        self.store_type("enum", match.group(1))
                        # An enum fully declared on a single line is
                        # already closed: don't handle the next lines
                        # as if they were enum symbols.
                        is_enum = "}" not in line
                        break

                for re_struct in self.RE_STRUCTS:
                    match = re_struct.match(line)
                    if match:
                        self.store_type("struct", match.group(1))
                        break

    def process_exceptions(self, fname: str):
        """
        Process exceptions file with rules to ignore or replace references.

        Empty lines and lines starting with "#" are skipped. Exits via
        sys.exit() on a malformed line or on an unknown symbol type.
        Does nothing when fname is empty.
        """
        if not fname:
            return

        name = os.path.basename(fname)

        with open(fname, "r", encoding="utf-8", errors="backslashreplace") as f:
            for ln, line in enumerate(f, start=1):
                line = line.strip()
                if not line or line.startswith("#"):
                    continue

                # Handle ignore rules
                match = re.match(r"^ignore\s+(\w+)\s+(\S+)", line)
                if match:
                    c_type = match.group(1)
                    symbol = match.group(2)

                    if c_type not in self.DEF_SYMBOL_TYPES:
                        sys.exit(f"{name}:{ln}: {c_type} is invalid")

                    # Ignoring a symbol that was never found is not an error
                    self.symbols[c_type].pop(symbol, None)

                    continue

                # Handle replace rules
                match = re.match(r"^replace\s+(\S+)\s+(\S+)\s+(\S+)", line)
                if not match:
                    sys.exit(f"{name}:{ln}: invalid line: {line}")

                c_type, old, new = match.groups()

                if c_type not in self.DEF_SYMBOL_TYPES:
                    sys.exit(f"{name}:{ln}: {c_type} is invalid")

                reftype = None

                # Parse reference type when the type is specified

                match = re.match(r"^\:c\:(data|func|macro|type)\:\`(.+)\`", new)
                if match:
                    reftype = f":c:{match.group(1)}"
                    new = match.group(2)
                else:
                    match = re.search(r"(\:ref)\:\`(.+)\`", new)
                    if match:
                        reftype = match.group(1)
                        new = match.group(2)

                # If the replacement rule doesn't have a type, get default
                if not reftype:
                    reftype = self.DEF_SYMBOL_TYPES[c_type].get("ref_type")
                    if not reftype:
                        # NOTE(review): no entry at DEF_SYMBOL_TYPES defines
                        # "real_type" here; kept for tables that may add it.
                        reftype = self.DEF_SYMBOL_TYPES[c_type].get("real_type")

                new_ref = f"{reftype}:`{old} <{new}>`"

                # Change self.symbols to use the replacement rule
                if old in self.symbols[c_type]:
                    self.symbols[c_type][old] = new_ref
                else:
                    print(f"{name}:{ln}: Warning: can't find {old} {c_type}")

    def debug_print(self):
        """
        Print debug information containing the replacement rules per symbol.
        To make easier to check, group them per type.
        """
        if not self.debug:
            return

        for c_type, refs in self.symbols.items():
            if not refs:  # Skip empty dictionaries
                continue

            print(f"{c_type}:")

            for symbol, ref in sorted(refs.items()):
                print(f"  {symbol} -> {ref}")

            print()

    def gen_output(self):
        """
        Return the stored file content as ReST text, with every known
        symbol replaced by its cross-reference.
        """

        # Avoid extra blank lines
        text = re.sub(r"\s+$", "", self.data) + "\n"
        text = re.sub(r"\n\s+\n", "\n\n", text)

        # Escape Sphinx special characters
        text = re.sub(r"([\_\`\*\<\>\&\\\\:\/\|\%\$\#\{\}\~\^])", r"\\\1", text)

        # Source uAPI files may have special notes. Use bold font for them
        text = re.sub(r"DEPRECATED", "**DEPRECATED**", text)

        # Delimiters to catch the entire symbol after escaped
        start_delim = r"([ \n\t\(=\*\@])"
        end_delim = r"(\s|,|\\=|\\:|\;|\)|\}|\{)"

        # Process all reference types
        for ref_dict in self.symbols.values():
            for symbol, replacement in ref_dict.items():
                # The text was already escaped, so escape the symbol the
                # same way before turning it into a regular expression.
                symbol = re.escape(re.sub(r"([\_\`\*\<\>\&\\\\:\/])", r"\\\1", symbol))

                # Use a callable instead of a replacement template: the
                # reference is inserted verbatim, so backslashes on it are
                # never parsed as regex escapes or group references.
                text = re.sub(fr'{start_delim}{symbol}{end_delim}',
                              lambda m, repl=replacement:
                              f"{m.group(1)}{repl}{m.group(2)}",
                              text)

        # Remove "\ " where not needed: before spaces and at the end of lines
        text = re.sub(r"\\ ([\n ])", r"\1", text)
        text = re.sub(r" \\ ", " ", text)

        return text

    def gen_toc(self):
        """
        Create a TOC table pointing to each symbol from the header.

        Symbol types are sorted by description; types without symbols are
        skipped. Returns the TOC as a single string.
        """
        text = []

        # Add header
        text.append(".. contents:: Table of Contents")
        text.append("   :depth: 2")
        text.append("   :local:")
        text.append("")

        # Sort symbol types per description
        symbol_descriptions = sorted(
            (v['description'], k) for k, v in self.DEF_SYMBOL_TYPES.items()
        )

        # Process each category
        for description, c_type in symbol_descriptions:

            refs = self.symbols[c_type]
            if not refs:  # Skip empty categories
                continue

            text.append(f"{description}")
            text.append("-" * len(description))
            text.append("")

            # Sort symbols alphabetically.
            # NOTE(review): ref already carries its own role markup plus
            # the "\ " prefix/suffix; confirm the extra colons are wanted.
            for _symbol, ref in sorted(refs.items()):
                text.append(f"* :{ref}:")

            text.append("")  # Add empty line between categories

        return "\n".join(text)

    def write_output(self, file_in: str, file_out: str, toc: bool):
        """
        Write the ReST output to file_out.

        The base name of file_in is used as the document title. When toc
        is true, the table produced by gen_toc() is written; otherwise the
        result of gen_output() is written inside a parsed-literal block.
        """
        title = os.path.basename(file_in)

        if toc:
            text = self.gen_toc()
        else:
            text = self.gen_output()

        with open(file_out, "w", encoding="utf-8", errors="backslashreplace") as f:
            f.write(".. -*- coding: utf-8; mode: rst -*-\n\n")
            f.write(f"{title}\n")
            f.write("=" * len(title) + "\n\n")

            if not toc:
                f.write(".. parsed-literal::\n\n")

            f.write(text)
453

454
Product

Resources

Company