"""
Convert leading spaces to tabs using tree-sitter for accurate C++ parsing.

Uses the tree-sitter C++ parser to determine the correct indentation depth
of each line, then replaces leading whitespace with the correct number of tabs.

Only converts lines where the tool is confident about the correct depth.
Lines inside block comments, continuation lines, and ambiguous cases are
left untouched; pass --verbose to see most of the skipped lines reported.

Requirements:
    pip install tree-sitter tree-sitter-cpp

Usage:
    python convert_leading_spaces_to_tabs.py [--dry-run] [--verbose] [--file PATH]
"""
| 17 | + |
| 18 | +import argparse |
| 19 | +import glob |
| 20 | +import os |
| 21 | + |
| 22 | +import tree_sitter_cpp as tscpp |
| 23 | +from tree_sitter import Language, Parser |
| 24 | + |
| 25 | + |
# tree-sitter Language object built from the grammar exposed by the
# tree_sitter_cpp package; process_file() hands it to Parser for every file.
CPP_LANGUAGE = Language(tscpp.language())
| 27 | + |
| 28 | + |
# tree-sitter node types whose children are indented one level deeper than
# the node itself. Membership is tested while walking up the AST.
SCOPE_CREATING_TYPES = frozenset((
    'compound_statement',
    'field_declaration_list',
    'declaration_list',  # body of a namespace
    'enumerator_list',
    'initializer_list',
    'case_statement',
))
| 38 | + |
| 39 | + |
def get_indent_depth(node):
    """Return the number of scope-creating ancestors of ``node``.

    The node itself is never counted; only its parents are inspected, all
    the way up to the root. An ancestor counts when its type is listed in
    SCOPE_CREATING_TYPES, with one exception: a ``compound_statement`` whose
    direct parent is a ``case_statement`` adds nothing, because the
    ``case_statement`` already contributes the level and the braces sit at
    the case level.
    """
    levels = 0
    ancestor = node.parent

    while ancestor is not None:
        outer = ancestor.parent
        if ancestor.type in SCOPE_CREATING_TYPES:
            braces_of_case = (
                ancestor.type == 'compound_statement'
                and outer is not None
                and outer.type == 'case_statement'
            )
            if not braces_of_case:
                levels += 1
        ancestor = outer

    return levels
| 59 | + |
| 60 | + |
def is_continuation_line(node, line_num):
    """Tell whether ``line_num`` continues a statement begun on an earlier line.

    Starting at ``node`` and climbing through every ancestor, the line is a
    continuation as soon as one of them both started on a row before
    ``line_num`` and is a statement-like construct (declaration, call,
    argument list, ...). Such lines are aligned by hand, so the caller
    leaves them alone.

    Returns False when ``node`` is None or no such ancestor exists.
    """
    statement_like = {
        'declaration', 'field_declaration', 'expression_statement',
        'return_statement', 'init_declarator', 'argument_list',
        'parameter_list', 'condition_clause', 'call_expression',
        'binary_expression', 'assignment_expression',
        'function_declarator', 'template_argument_list',
        'field_initializer_list', 'field_initializer',
        'base_class_clause', 'initializer_pair',
    }

    ancestor = node
    while ancestor is not None:
        # Nodes that begin on this very line can never make it a continuation;
        # only constructs opened on an earlier row matter.
        started_earlier = ancestor.start_point[0] < line_num
        if started_earlier and ancestor.type in statement_like:
            return True
        ancestor = ancestor.parent

    return False
| 95 | + |
| 96 | + |
| 97 | + |
| 98 | + |
| 99 | + |
# Control-flow statements whose body may be a single statement without braces.
_BRACELESS_PARENT_TYPES = frozenset({
    'if_statement',
    'else_clause',
    'for_statement',
    'for_range_loop',
    'while_statement',
    'do_statement',
})

# Children of the statements above that stay at their parent's level: the
# braces of a braced body, and an else clause lining up with its `if`.
_SAME_LEVEL_CHILD_TYPES = frozenset({
    'compound_statement',
    'else_clause',
})


def _has_error_ancestor(node):
    """Return True if ``node`` or any of its ancestors is an ERROR node."""
    while node is not None:
        if node.type == 'ERROR':
            return True
        node = node.parent
    return False


def _in_braceless_body(node):
    """Return True if ``node`` sits under a control-flow statement without braces.

    Detects a named, non-brace child of an if/else/for/while/do node that
    starts on a later row than that node, e.g. the ``return`` in::

        if (x)
            return;

    Counting brace scopes alone gives such lines a depth that is one too
    small. Wrapped clauses of a ``for`` header match as well; they are
    hand-aligned, so skipping them is equally desirable.
    """
    child, parent = node, node.parent
    while parent is not None:
        if (parent.type in _BRACELESS_PARENT_TYPES
                and child.is_named
                and child.type not in _SAME_LEVEL_CHILD_TYPES
                and child.start_point[0] > parent.start_point[0]):
            return True
        child, parent = parent, parent.parent
    return False


def process_file(filepath, dry_run=False, verbose=False):
    """Convert the leading spaces of one C++ file to tabs.

    The file is parsed with tree-sitter and every line that starts with
    spaces is re-indented with one tab per enclosing scope. Lines whose
    depth cannot be trusted are copied through unchanged: block comments,
    macro continuations, preprocessor directives, mixed tab/space
    indentation, parse errors, continuation lines, braceless control-flow
    bodies and access specifiers.

    Args:
        filepath: Path of the file to process. It is read and written as
            cp1252, with newline translation disabled so the existing
            line endings (LF or CRLF) are kept byte-for-byte.
        dry_run: When true, compute the changes but do not write the file.
        verbose: When true, print a line for each change and for most skips.

    Returns:
        A ``(changed, skipped, total_lines)`` tuple. ``(0, 0, 0)`` is
        returned when the file cannot be read or decoded.
    """
    parser = Parser(CPP_LANGUAGE)

    try:
        # newline='' turns off universal-newline translation; otherwise CRLF
        # files would be rewritten with the platform's default line ending.
        with open(filepath, 'r', encoding='cp1252', newline='') as f:
            content = f.read()
    except (UnicodeDecodeError, OSError) as exc:
        if verbose:
            print(f" SKIP file (cannot read: {exc})")
        return 0, 0, 0

    lines = content.split('\n')
    if content.endswith('\n'):
        lines = lines[:-1]  # split adds an empty string after trailing \n

    tree = parser.parse(content.encode('utf-8'))

    # If the file has excessive parse errors, the AST is unreliable - skip it.
    # Only count top-level errors (not deeply nested ones from macro expansions).
    error_count = sum(1 for child in tree.root_node.children if child.type == 'ERROR')
    if error_count > 10:
        if verbose:
            print(f" SKIP file (too many top-level parse errors: {error_count})")
        return 0, 0, len(lines)

    new_lines = []
    changed = 0
    skipped = 0
    in_block_comment = False
    in_macro_continuation = False

    for line_idx, line in enumerate(lines):
        stripped = line.lstrip()
        line_continues_macro = line.rstrip().endswith('\\')

        # Block comments are tracked by hand; their lines are never touched.
        if in_block_comment:
            new_lines.append(line)
            if '*/' in line:
                in_block_comment = False
            continue

        if stripped.startswith('/*'):
            # find() < 2 means "no terminator" (-1) or the degenerate "/*/"
            # opener (1). Either way the comment is not a simple one-liner.
            if stripped.find('*/') < 2:
                in_block_comment = '*/' not in stripped[2:]
                new_lines.append(line)
                continue

        # Multi-line macro continuations - don't touch
        if in_macro_continuation:
            new_lines.append(line)
            in_macro_continuation = line_continues_macro
            continue

        # Blank lines - preserve as-is
        if stripped == '':
            new_lines.append(line)
            continue

        # Preprocessor directives are kept; a trailing backslash starts a
        # macro continuation that covers the following lines.
        if stripped.startswith('#'):
            new_lines.append(line)
            in_macro_continuation = line_continues_macro
            continue

        leading_ws = line[:len(line) - len(stripped)]
        if ' ' not in leading_ws:
            # Already all tabs (or no indentation) - keep as-is
            new_lines.append(line)
            continue

        # Skip mixed tab+space lines (e.g. tab then spaces for alignment)
        if '\t' in leading_ws and ' ' in leading_ws:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP mixed L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # This line has leading spaces. Find the AST node at the first
        # non-whitespace character to determine its depth.
        col = len(leading_ws)
        node = tree.root_node.descendant_for_point_range(
            (line_idx, col), (line_idx, col)
        )

        if node is None:
            new_lines.append(line)
            skipped += 1
            continue

        # Skip if the node or any ancestor is an ERROR node (parse failure)
        if _has_error_ancestor(node):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP parse error L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Interior of a block comment that opened mid-line on an earlier row.
        # The manual tracker above only sees comments that start a line, so
        # rely on the AST here. Comments that begin on this line (// or a
        # one-line /* */) are still fine to reindent.
        if node.type == 'comment' and node.start_point[0] < line_idx:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP comment L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Skip continuation lines
        if is_continuation_line(node, line_idx):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP continuation L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Bodies of braceless if/else/for/while/do are one level deeper than
        # get_indent_depth() reports, so converting them would dedent code.
        if _in_braceless_body(node):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP braceless L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Calculate expected depth
        depth = get_indent_depth(node)

        # The opening/closing braces of a scope belong at the parent's depth,
        # not at the depth of the scope's contents.
        if node.type in ('{', '}'):
            parent = node.parent
            if parent is not None and parent.type in SCOPE_CREATING_TYPES:
                # For a compound_statement inside case_statement, braces go
                # at the case label level (case_statement's own depth).
                if (parent.type == 'compound_statement'
                        and parent.parent is not None
                        and parent.parent.type == 'case_statement'):
                    depth = get_indent_depth(parent.parent)
                else:
                    depth = get_indent_depth(parent)

        # case/default keywords are children of case_statement but are
        # written at the case_statement's own depth.
        if node.type in ('case', 'default'):
            parent = node.parent
            if parent is not None and parent.type == 'case_statement':
                depth = get_indent_depth(parent)

        # Access specifiers (public/private/protected) are deferred.
        if node.type in ('public', 'private', 'protected'):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP access spec L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # A colon belonging to an access specifier is skipped as well.
        if node.type == ':':
            parent = node.parent
            if parent is not None and parent.type == 'access_specifier':
                new_lines.append(line)
                skipped += 1
                continue

        # Sanity check: significant indentation but computed depth 0 usually
        # means the AST is unreliable here (parse errors nearby, inline
        # assembly, etc). The threshold of 4 still allows depth-0 code with
        # minor (1-3 space) indentation errors to be fixed.
        space_count = len(leading_ws)
        if depth == 0 and space_count >= 4:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f" SKIP sanity L{line_idx+1}: depth=0 but {space_count} spaces: {line.rstrip()[:80]}")
            continue

        # Build the new line: depth tabs + original content (no leading whitespace)
        new_line = '\t' * depth + stripped
        if new_line != line:
            changed += 1
            if verbose:
                print(f" CHANGE L{line_idx+1} (depth={depth}): '{leading_ws}' -> '{chr(9)*depth}'")
        new_lines.append(new_line)

    if changed > 0 and not dry_run:
        # newline='' again: write the '\n' / '\r\n' sequences exactly as read.
        with open(filepath, 'w', encoding='cp1252', newline='') as f:
            f.write('\n'.join(new_lines))
            if content.endswith('\n'):
                f.write('\n')

    return changed, skipped, len(lines)
| 299 | + |
| 300 | + |
def main():
    """Command-line entry point: convert one file or the whole codebase.

    Without --file, every *.cpp, *.h and *.inl file below the Core,
    Generals, GeneralsMD and Dependencies/Utility directories (resolved
    two levels above this script) is processed. A line is printed for each
    file with changes, followed by overall totals.
    """
    arg_parser = argparse.ArgumentParser(
        description='Convert leading spaces to tabs using tree-sitter C++ parsing.'
    )
    arg_parser.add_argument('--dry-run', action='store_true',
                            help='Show what would change without modifying files')
    arg_parser.add_argument('--verbose', '-v', action='store_true',
                            help='Print details about each changed/skipped line')
    arg_parser.add_argument('--file', type=str,
                            help='Process a single file instead of the whole codebase')
    options = arg_parser.parse_args()

    here = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.normpath(os.path.join(here, '..', '..'))

    if options.file:
        targets = [options.file]
    else:
        search_roots = [
            os.path.join(root_dir, *parts)
            for parts in (
                ('Core',),
                ('Generals',),
                ('GeneralsMD',),
                ('Dependencies', 'Utility'),
            )
        ]
        targets = [
            path
            for pattern in ('*.cpp', '*.h', '*.inl')
            for base in search_roots
            for path in glob.glob(os.path.join(base, '**', pattern), recursive=True)
        ]

    # The wording depends only on the mode, so decide it once.
    action = "Would change" if options.dry_run else "Changed"

    total_changed = 0
    total_skipped = 0
    total_files_modified = 0

    for filepath in sorted(targets):
        changed, skipped, _total_lines = process_file(
            filepath, dry_run=options.dry_run, verbose=options.verbose
        )
        total_changed += changed
        total_skipped += skipped
        if changed > 0:
            total_files_modified += 1
            rel = os.path.relpath(filepath, root_dir)
            print(f"{action} {changed} lines in {rel}")

    print(f"\n{action} {total_changed} lines across {total_files_modified} files")
    print(f"Skipped {total_skipped} lines (continuations, access specifiers, ambiguous)")
| 352 | + |
# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
0 commit comments