Skip to content

Commit d5b0b33

Browse files
committed
style: Add tree-sitter based script to convert leading spaces to tabs
1 parent 476a57c commit d5b0b33

1 file changed

Lines changed: 354 additions & 0 deletions

File tree

Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
"""
Convert leading spaces to tabs using tree-sitter for accurate C++ parsing.

Uses the tree-sitter C++ parser to determine the correct indentation depth
of each line, then replaces leading whitespace with the correct number of tabs.

Only converts lines where the tool is confident about the correct depth.
Lines inside block comments, continuation lines, and ambiguous cases are
skipped and, with --verbose, logged.

Requirements:
    pip install tree-sitter tree-sitter-cpp

Usage:
    python convert_leading_spaces_to_tabs.py [--dry-run] [--verbose] [--file PATH]
"""
17+
18+
import argparse
19+
import glob
20+
import os
21+
22+
import tree_sitter_cpp as tscpp
23+
from tree_sitter import Language, Parser
24+
25+
26+
# Tree-sitter Language object built from the tree_sitter_cpp grammar.
# process_file() hands it to Parser to parse each source file.
CPP_LANGUAGE = Language(tscpp.language())
27+
28+
29+
# AST node kinds whose children are indented one level deeper than the
# node itself. get_indent_depth() counts ancestors of these kinds.
SCOPE_CREATING_TYPES = frozenset((
    'compound_statement',
    'field_declaration_list',
    'declaration_list',  # namespace body
    'enumerator_list',
    'initializer_list',
    'case_statement',
))
38+
39+
40+
def get_indent_depth(node):
    """Return the indentation depth implied by *node*'s ancestors.

    Every ancestor whose type is in SCOPE_CREATING_TYPES contributes one
    level, with one exception: a ``compound_statement`` whose direct parent
    is a ``case_statement`` adds nothing, because the case_statement already
    supplies that level and the braces sit at the case level.

    The node itself is never counted; only its parents are inspected.
    """
    levels = 0
    ancestor = node.parent

    while ancestor is not None:
        grandparent = ancestor.parent
        if ancestor.type in SCOPE_CREATING_TYPES:
            # Braced body directly under a case label: the case_statement
            # has already accounted for this indent level.
            braces_of_case = (
                ancestor.type == 'compound_statement'
                and grandparent is not None
                and grandparent.type == 'case_statement'
            )
            if not braces_of_case:
                levels += 1
        ancestor = grandparent

    return levels
59+
60+
61+
def is_continuation_line(node, line_num):
    """Tell whether *line_num* continues a statement begun on an earlier line.

    Starting at *node* and walking up through its parents, the line counts as
    a continuation as soon as an ancestor is found that both started on a
    previous row and is one of the statement/expression-like node types
    listed below. Such lines are skipped by the caller because their
    alignment is a style choice that should not be altered here.

    Args:
        node: AST node at the first non-whitespace character of the line,
            or None.
        line_num: Zero-based row of the line being examined.

    Returns:
        True if the line is a continuation, otherwise False (including when
        *node* is None).
    """
    multi_line_constructs = {
        'declaration', 'field_declaration', 'expression_statement',
        'return_statement', 'init_declarator', 'argument_list',
        'parameter_list', 'condition_clause', 'call_expression',
        'binary_expression', 'assignment_expression',
        'function_declarator', 'template_argument_list',
        'field_initializer_list', 'field_initializer',
        'base_class_clause', 'initializer_pair',
    }

    ancestor = node
    while ancestor is not None:
        # Nodes that begin on this very line can never make it a
        # continuation; only constructs opened on an earlier row can.
        began_earlier = ancestor.start_point[0] < line_num
        if began_earlier and ancestor.type in multi_line_constructs:
            return True
        ancestor = ancestor.parent

    return False
95+
96+
97+
98+
99+
100+
def process_file(filepath, dry_run=False, verbose=False):
    """Convert the leading spaces of one C++ source file to tabs.

    The file is parsed with tree-sitter, and each line that is indented with
    spaces only is rewritten with one tab per indentation level, as computed
    by get_indent_depth() for the node at the line's first non-whitespace
    character. A line is left untouched when it is blank, a preprocessor
    directive or macro continuation, part of a block comment, a continuation
    of a multi-line statement, an access specifier, located under a parse
    error, or when the computed depth looks implausible.

    Args:
        filepath: Path of the file to process. It is read and written as
            cp1252 with newline translation disabled, so existing CRLF/LF
            line endings are preserved exactly.
        dry_run: When true, changes are counted but the file is not written.
        verbose: When true, print details about changed and skipped lines.

    Returns:
        A ``(changed, skipped, total_lines)`` tuple. ``(0, 0, 0)`` is
        returned when the file cannot be read or decoded, and
        ``(0, 0, total_lines)`` when it has more than 10 top-level parse
        errors.
    """
    parser = Parser(CPP_LANGUAGE)

    try:
        # newline='' turns off universal-newline translation; otherwise CRLF
        # would be read as LF and rewritten with the platform's line ending.
        with open(filepath, 'r', encoding='cp1252', newline='') as f:
            content = f.read()
    except (UnicodeDecodeError, OSError) as exc:
        if verbose:
            print(f"  SKIP file (cannot read: {exc})")
        return 0, 0, 0

    # Split on '\n' only: a '\r' from a CRLF ending stays attached to its
    # line and is written back unchanged.
    lines = content.split('\n')
    if content.endswith('\n'):
        lines = lines[:-1]  # split adds an empty string after trailing \n

    code_bytes = content.encode('utf-8')
    tree = parser.parse(code_bytes)

    # If the file has excessive parse errors, the AST is unreliable - skip it.
    # Only count top-level errors (not deeply nested ones from macro expansions).
    error_count = sum(1 for child in tree.root_node.children if child.type == 'ERROR')
    if error_count > 10:
        if verbose:
            print(f"  SKIP file (too many top-level parse errors: {error_count})")
        return 0, 0, len(lines)

    new_lines = []
    changed = 0
    skipped = 0
    in_block_comment = False
    in_macro_continuation = False

    for line_idx, line in enumerate(lines):
        # Track block comments manually for reliability
        stripped = line.lstrip()
        line_continues_macro = line.rstrip().endswith('\\')

        if in_block_comment:
            # Inside a block comment - don't touch
            new_lines.append(line)
            if '*/' in line:
                in_block_comment = False
            continue

        if stripped.startswith('/*'):
            # Only a comment that is not closed on this line (or the '/*/'
            # corner case) is left alone here; a complete one-line /* */
            # comment falls through and may be reindented.
            if '*/' not in stripped or stripped.index('*/') < stripped.index('/*') + 2:
                in_block_comment = '*/' not in stripped[2:]
                new_lines.append(line)
                continue

        # Multi-line macro continuations - don't touch
        if in_macro_continuation:
            new_lines.append(line)
            in_macro_continuation = line_continues_macro
            continue

        # Blank lines - preserve as-is
        if stripped == '':
            new_lines.append(line)
            continue

        # Preprocessor directives - left as-is; remember whether the
        # directive continues onto the next line.
        if stripped.startswith('#'):
            new_lines.append(line)
            in_macro_continuation = line_continues_macro
            continue

        # Check if the line already uses tabs (no leading spaces)
        leading_ws = line[:len(line) - len(stripped)]
        if ' ' not in leading_ws:
            # Already all tabs (or no indentation) - keep as-is
            new_lines.append(line)
            continue

        # Skip mixed indentation (e.g. tab then spaces for alignment).
        # Anything other than plain spaces is treated as mixed, so stray
        # whitespace characters are never silently dropped.
        if leading_ws.strip(' '):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP mixed L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # This line has leading spaces. Use tree-sitter to determine depth.
        # Find the AST node at the first non-whitespace character. The
        # leading whitespace is all ASCII spaces here, so the character
        # count equals the byte column tree-sitter expects.
        col = len(leading_ws)
        node = tree.root_node.descendant_for_point_range(
            (line_idx, col), (line_idx, col)
        )

        if node is None:
            new_lines.append(line)
            skipped += 1
            continue

        # Skip if the node or any ancestor is an ERROR node (parse failure)
        error_ancestor = False
        check = node
        while check is not None:
            if check.type == 'ERROR':
                error_ancestor = True
                break
            check = check.parent
        if error_ancestor:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP parse error L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # A comment node that began on an earlier row means this line is
        # the interior of a block comment the manual tracking above did not
        # see (it opened after code on its first line). Its internal
        # formatting is intentional, so leave it alone.
        if node.type == 'comment' and node.start_point[0] < line_idx:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP block comment L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Skip continuation lines
        if is_continuation_line(node, line_idx):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP continuation L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Calculate expected depth
        depth = get_indent_depth(node)

        # Special handling: the opening/closing braces of a compound_statement
        # should be at the parent's depth, not the compound_statement's depth.
        if node.type in ('{', '}'):
            parent = node.parent
            if parent is not None and parent.type in SCOPE_CREATING_TYPES:
                # For a compound_statement inside case_statement, braces should
                # be at the case label level (case_statement's own depth)
                if (parent.type == 'compound_statement'
                        and parent.parent is not None
                        and parent.parent.type == 'case_statement'):
                    depth = get_indent_depth(parent.parent)
                else:
                    depth = get_indent_depth(parent)

        # Special handling: case/default labels - they are children of
        # case_statement but should be at the case_statement's depth
        # (the 'case' keyword itself)
        if node.type in ('case', 'default'):
            parent = node.parent
            if parent is not None and parent.type == 'case_statement':
                depth = get_indent_depth(parent)

        # Special handling: access specifiers (public/private/protected)
        # For now, skip these - deferred to another PR
        if node.type in ('public', 'private', 'protected'):
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP access spec L{line_idx+1}: {line.rstrip()[:80]}")
            continue

        # Special handling: the colon after an access specifier
        if node.type == ':':
            parent = node.parent
            if parent is not None and parent.type == 'access_specifier':
                new_lines.append(line)
                skipped += 1
                continue

        # Sanity check: if the line had significant indentation but we
        # computed depth 0, something is likely wrong (parse errors nearby,
        # inline assembly corrupting the AST, etc). Skip to be safe.
        # Threshold of 4 spaces catches most misparses while allowing
        # legitimate depth-0 code with minor (1-3 space) indentation errors.
        space_count = len(leading_ws)
        if depth == 0 and space_count >= 4:
            new_lines.append(line)
            skipped += 1
            if verbose:
                print(f"  SKIP sanity L{line_idx+1}: depth=0 but {space_count} spaces: {line.rstrip()[:80]}")
            continue

        # Build the new line: depth tabs + original content (no leading whitespace)
        new_line = '\t' * depth + stripped
        if new_line != line:
            changed += 1
            if verbose:
                print(f"  CHANGE L{line_idx+1} (depth={depth}): '{leading_ws}' -> '{chr(9)*depth}'")
        new_lines.append(new_line)

    if changed > 0 and not dry_run:
        # newline='' again: write the '\n' (and any retained '\r') bytes
        # exactly as assembled, without platform translation.
        with open(filepath, 'w', encoding='cp1252', newline='') as f:
            f.write('\n'.join(new_lines))
            if content.endswith('\n'):
                f.write('\n')

    return changed, skipped, len(lines)
299+
300+
301+
def main():
    """Command-line entry point: convert files and print a summary.

    Without ``--file`` every ``*.cpp``, ``*.h`` and ``*.inl`` file below the
    Core, Generals, GeneralsMD and Dependencies/Utility directories is
    processed; these are resolved relative to the directory two levels above
    this script. With ``--file`` only that one path is processed.
    """
    arg_parser = argparse.ArgumentParser(
        description='Convert leading spaces to tabs using tree-sitter C++ parsing.'
    )
    arg_parser.add_argument('--dry-run', action='store_true',
                            help='Show what would change without modifying files')
    arg_parser.add_argument('--verbose', '-v', action='store_true',
                            help='Print details about each changed/skipped line')
    arg_parser.add_argument('--file', type=str,
                            help='Process a single file instead of the whole codebase')
    args = arg_parser.parse_args()

    here = os.path.dirname(os.path.abspath(__file__))
    root_dir = os.path.normpath(os.path.join(here, '..', '..'))

    if args.file:
        targets = [args.file]
    else:
        search_roots = [
            os.path.join(root_dir, *parts)
            for parts in (
                ('Core',),
                ('Generals',),
                ('GeneralsMD',),
                ('Dependencies', 'Utility'),
            )
        ]
        targets = [
            path
            for pattern in ('*.cpp', '*.h', '*.inl')
            for base in search_roots
            for path in glob.glob(os.path.join(base, '**', pattern), recursive=True)
        ]

    # The wording depends only on --dry-run, so decide it once.
    action = "Would change" if args.dry_run else "Changed"

    total_changed = 0
    total_skipped = 0
    total_files_modified = 0

    for filepath in sorted(targets):
        changed, skipped, _total_lines = process_file(
            filepath, dry_run=args.dry_run, verbose=args.verbose
        )
        total_changed += changed
        total_skipped += skipped
        if changed <= 0:
            continue
        total_files_modified += 1
        rel = os.path.relpath(filepath, root_dir)
        print(f"{action} {changed} lines in {rel}")

    print(f"\n{action} {total_changed} lines across {total_files_modified} files")
    print(f"Skipped {total_skipped} lines (continuations, access specifiers, ambiguous)")
351+
352+
353+
# Run the converter only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)