diff --git a/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java b/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java index 1d6ce194..e099aed8 100644 --- a/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java +++ b/gen.treesitter-ng/src/main/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGenerator.java @@ -28,10 +28,12 @@ import java.io.BufferedReader; import java.io.Reader; +import java.nio.charset.StandardCharsets; import java.util.*; public abstract class AbstractTreeSitterNgGenerator extends TreeGenerator { + private static final String LF = "\n"; private static final String RULES_FILE = "rules.yml"; private static final String YAML_IGNORED = "ignored"; @@ -47,16 +49,22 @@ public abstract class AbstractTreeSitterNgGenerator extends TreeGenerator { } @Override - protected TreeContext generate(Reader r) { + protected TreeContext generate(Reader r) throws java.io.IOException { TSParser parser = new TSParser(); TSLanguage language = getTreeSitterLanguage(); parser.setLanguage(language); - BufferedReader bufferedReader = new BufferedReader(r); - List contentLines = bufferedReader.lines().toList(); - if (contentLines.isEmpty()) + + StringBuilder sb = new StringBuilder(); + char[] buf = new char[8192]; + int n; + while ((n = r.read(buf)) != -1) { + sb.append(buf, 0, n); + } + String content = sb.toString(); + if (content.isEmpty()) return emptyContext(); - String content = String.join(System.lineSeparator(), contentLines); + List contentLines = Arrays.asList(content.split(LF, -1)); TSTree tree = parser.parseString(null, content); Map currentRule = RULES.getOrDefault(getLanguageName(), new HashMap<>()); return generateFromTreeSitterTree(contentLines, currentRule, tree); @@ -70,7 +78,7 @@ private static String getLabel(List contentLines, TSNode node) { List substringLines; // tree-sitter 
handles string by byte array, so we need this. String startRowStr = contentLines.get(startRow); - byte[] startRowBytes = startRowStr.getBytes(); + byte[] startRowBytes = startRowStr.getBytes(StandardCharsets.UTF_8); if (startRow == endRow) { // endColumn == startRowBytes.length + 1 when the label in tree-sitter contains line separator if (endColumn == startRowBytes.length + 1) { @@ -78,20 +86,20 @@ private static String getLabel(List contentLines, TSNode node) { } else { substringLines = Collections.singletonList(new String( - startRowBytes, startColumn, endColumn - startColumn)); + startRowBytes, startColumn, endColumn - startColumn, StandardCharsets.UTF_8)); } } else { substringLines = new ArrayList<>(); String endRowStr = contentLines.get(endRow); - byte[] endRowBytes = endRowStr.getBytes(); + byte[] endRowBytes = endRowStr.getBytes(StandardCharsets.UTF_8); String startLineSubstring; if (startColumn > startRowBytes.length) { // usually, line separator is not the start char of a tree node label // if this situation happened, just put an empty string at start startLineSubstring = ""; } else { - startLineSubstring = new String(startRowBytes, 0, startColumn); + startLineSubstring = new String(startRowBytes, startColumn, startRowBytes.length - startColumn, StandardCharsets.UTF_8); } List middleLines = contentLines.subList(startRow + 1, endRow); String endLineSubstring; @@ -99,13 +107,13 @@ private static String getLabel(List contentLines, TSNode node) { endLineSubstring = endRowStr; } else { - endLineSubstring = new String(endRowBytes, 0, endColumn); + endLineSubstring = new String(endRowBytes, 0, endColumn, StandardCharsets.UTF_8); } substringLines.add(startLineSubstring); substringLines.addAll(middleLines); substringLines.add(endLineSubstring); } - return String.join(System.lineSeparator(), substringLines); + return String.join(LF, substringLines); } private static int calculateOffset(List contentLines, 
TSPoint point) { int startColumn = point.getColumn(); int offset = 0; for (int i = 0; i < startRow; i++) { - offset += contentLines.get(i).length() + System.lineSeparator().length(); + // Each line in contentLines (except the last) was terminated by LF (\n) in the original content. + // If the original was CRLF, the CR (\r) is still at the end of the line string and is counted with the line. + // getBytes(StandardCharsets.UTF_8).length + 1 counts [LineContent] in UTF-8 bytes + one byte for [LF]. NOTE(review): the result is a byte offset, not a char offset - confirm consumers of tree positions expect bytes. + offset += contentLines.get(i).getBytes(StandardCharsets.UTF_8).length + 1; } offset += startColumn; return offset; diff --git a/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java b/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java index 5950b65e..ccb2abb9 100644 --- a/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java +++ b/gen.treesitter-ng/src/test/java/com/github/gumtreediff/gen/treesitterng/AbstractTreeSitterNgGeneratorTest.java @@ -18,8 +18,11 @@ */ package com.github.gumtreediff.gen.treesitterng; +import com.github.gumtreediff.tree.Tree; +import com.github.gumtreediff.tree.TreeContext; import org.junit.jupiter.api.Test; +import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -27,6 +30,47 @@ import static org.junit.jupiter.api.Assertions.*; public class AbstractTreeSitterNgGeneratorTest { + private final PythonTreeSitterNgTreeGenerator generator = new PythonTreeSitterNgTreeGenerator(); + + @Test + public void OffsetConsistency_testLFOffsets() throws IOException { + // Line 1: "x = 1\n" (5 chars + 1 LF = 6 bytes) + // Line 2: "y = 2" + String content = "x = 1\ny = 2"; + TreeContext ctx = generator.generateFrom().string(content); + + // Find the second assignment (y = 2) + // Root (module) -> children[1] (expression_statement) + Tree yAssignment = ctx.getRoot().getChild(1); + assertEquals("expression_statement", yAssignment.getType().name); + 
assertEquals(6, yAssignment.getPos(), "Line 2 should start at byte offset 6 for LF content"); + } + + @Test + public void OffsetConsistency_testCRLFOffsets() throws IOException { + // Line 1: "x = 1\r\n" (5 chars + 2 CRLF = 7 bytes) + // Line 2: "y = 2" + String content = "x = 1\r\ny = 2"; + TreeContext ctx = generator.generateFrom().string(content); + + Tree yAssignment = ctx.getRoot().getChild(1); + assertEquals("expression_statement", yAssignment.getType().name); + assertEquals(7, yAssignment.getPos(), "Line 2 should start at byte offset 7 for CRLF content"); + } + + @Test + public void OffsetConsistency_testMultiByteOffsets() throws IOException { + // Line 1: "# 🐍\n" + // '#' (1) + ' ' (1) + '🐍' (4 bytes in UTF-8) + '\n' (1) = 7 bytes total + // Line 2: "x = 1" (NOTE(review): getChild(1) below presumes the comment node is kept as child 0 of the root - verify against the generator's ignore rules) + String content = "# 🐍\nx = 1"; + TreeContext ctx = generator.generateFrom().string(content); + + Tree xAssignment = ctx.getRoot().getChild(1); + assertEquals("expression_statement", xAssignment.getType().name); + assertEquals(7, xAssignment.getPos(), "Line 2 should start at byte offset 7 after a 4-byte emoji and LF"); + } + @Test public void testMatchNodeOrAncestorTypes() { MockTypeOnlyTreeSitterNode root = new MockTypeOnlyTreeSitterNode();