|
18 | 18 | import argparse |
19 | 19 | import glob |
20 | 20 | import os |
| 21 | +import re |
21 | 22 |
|
22 | 23 | import tree_sitter_cpp as tscpp |
23 | 24 | from tree_sitter import Language, Parser |
|
26 | 27 | CPP_LANGUAGE = Language(tscpp.language()) |
27 | 28 |
|
28 | 29 |
|
# Macros that confuse tree-sitter's C++ parser. These are expanded in the
# source text before parsing (but not in the output).
#
# Each entry is (macro name, expansion). The expansion must not be longer than
# the macro name: it is right-padded with spaces to the macro's exact length
# when MACRO_EXPANSIONS is built, so every byte offset in the parsed text
# still refers to the same position in the original text.
_MACRO_TEXT = (
    (b'CALLBACK', b''),
    (b'GCALL', b''),
    (b'WINAPI', b''),
    (b'IN', b''),
    (b'OUT', b''),
    (b'RO', b''),
    (b'W3DNEW', b'new'),
    (b'NEW', b'new'),
    (b'__RPC_FAR', b''),
    (b'__RPC_STUB', b''),
    (b'__asm', b''),
    (b'_asm', b''),
)

# List of (compiled whole-word pattern, same-length replacement) pairs.
# The padding is computed rather than typed by hand so that the same-length
# invariant cannot be broken by miscounted or collapsed whitespace.
MACRO_EXPANSIONS = [
    (
        re.compile(rb'\b' + re.escape(macro) + rb'\b'),
        expansion.ljust(len(macro)),
    )
    for macro, expansion in _MACRO_TEXT
]


def preprocess_for_parsing(code_bytes):
    """Return a copy of *code_bytes* with parser-confusing macros blanked out.

    Every whole-word occurrence of a macro listed in MACRO_EXPANSIONS is
    replaced by its same-length expansion, so the result has exactly the same
    length as the input and byte offsets taken from a parse of the result can
    be applied directly to the original bytes.

    Args:
        code_bytes: The source text as a bytes object.

    Returns:
        A bytes object of the same length with the macros substituted.
    """
    result = code_bytes
    for pattern, replacement in MACRO_EXPANSIONS:
        result = pattern.sub(replacement, result)
    return result
| 55 | + |
| 56 | + |
29 | 57 | # Node types that create an indentation level for their children. |
30 | 58 | SCOPE_CREATING_TYPES = frozenset({ |
31 | 59 | 'compound_statement', |
@@ -116,7 +144,8 @@ def process_file(filepath, dry_run=False, verbose=False): |
116 | 144 | lines = lines[:-1] # split adds an empty string after trailing \n |
117 | 145 |
|
118 | 146 | code_bytes = content.encode('utf-8') |
119 | | - tree = parser.parse(code_bytes) |
| 147 | + parse_bytes = preprocess_for_parsing(code_bytes) |
| 148 | + tree = parser.parse(parse_bytes) |
120 | 149 |
|
121 | 150 | # If the file has excessive parse errors, the AST is unreliable - skip it. |
122 | 151 | # Only count top-level errors (not deeply nested ones from macro expansions). |
|
0 commit comments