Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@

import java.io.BufferedReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.*;

public abstract class AbstractTreeSitterNgGenerator extends TreeGenerator {

private static final String LF = "\n";
private static final String RULES_FILE = "rules.yml";

private static final String YAML_IGNORED = "ignored";
Expand All @@ -47,16 +49,22 @@ public abstract class AbstractTreeSitterNgGenerator extends TreeGenerator {
}

@Override
protected TreeContext generate(Reader r) {
protected TreeContext generate(Reader r) throws java.io.IOException {
TSParser parser = new TSParser();
TSLanguage language = getTreeSitterLanguage();
parser.setLanguage(language);
BufferedReader bufferedReader = new BufferedReader(r);
List<String> contentLines = bufferedReader.lines().toList();
if (contentLines.isEmpty())

StringBuilder sb = new StringBuilder();
char[] buf = new char[8192];
int n;
while ((n = r.read(buf)) != -1) {
sb.append(buf, 0, n);
}
String content = sb.toString();
if (content.isEmpty())
return emptyContext();

String content = String.join(System.lineSeparator(), contentLines);
List<String> contentLines = Arrays.asList(content.split(LF, -1));
TSTree tree = parser.parseString(null, content);
Map<String, Object> currentRule = RULES.getOrDefault(getLanguageName(), new HashMap<>());
return generateFromTreeSitterTree(contentLines, currentRule, tree);
Expand All @@ -70,50 +78,53 @@ private static String getLabel(List<String> contentLines, TSNode node) {
List<String> substringLines;
// tree-sitter handles string by byte array, so we need this.
String startRowStr = contentLines.get(startRow);
byte[] startRowBytes = startRowStr.getBytes();
byte[] startRowBytes = startRowStr.getBytes(StandardCharsets.UTF_8);
if (startRow == endRow) {
// endColumn == startRowBytes.length + 1 when the label in tree-sitter contains line separator
if (endColumn == startRowBytes.length + 1) {
substringLines = Collections.singletonList(startRowStr);
}
else {
substringLines = Collections.singletonList(new String(
startRowBytes, startColumn, endColumn - startColumn));
startRowBytes, startColumn, endColumn - startColumn, StandardCharsets.UTF_8));
}
}
else {
substringLines = new ArrayList<>();
String endRowStr = contentLines.get(endRow);
byte[] endRowBytes = endRowStr.getBytes();
byte[] endRowBytes = endRowStr.getBytes(StandardCharsets.UTF_8);
String startLineSubstring;
if (startColumn > startRowBytes.length) {
// usually, line separator is not the start char of a tree node label
// if this situation happened, just put an empty string at start
startLineSubstring = "";
} else {
startLineSubstring = new String(startRowBytes, 0, startColumn);
startLineSubstring = new String(startRowBytes, 0, startColumn, StandardCharsets.UTF_8);
}
List<String> middleLines = contentLines.subList(startRow + 1, endRow);
String endLineSubstring;
if (endColumn > endRowStr.length()) {
endLineSubstring = endRowStr;
}
else {
endLineSubstring = new String(endRowBytes, 0, endColumn);
endLineSubstring = new String(endRowBytes, 0, endColumn, StandardCharsets.UTF_8);
}
substringLines.add(startLineSubstring);
substringLines.addAll(middleLines);
substringLines.add(endLineSubstring);
}
return String.join(System.lineSeparator(), substringLines);
return String.join(LF, substringLines);
}

private static int calculateOffset(List<String> contentLines, TSPoint point) {
int startRow = point.getRow();
int startColumn = point.getColumn();
int offset = 0;
for (int i = 0; i < startRow; i++) {
offset += contentLines.get(i).length() + System.lineSeparator().length();
// Each line in contentLines (except maybe the last) was terminated by LF (\n).
// If the original was CRLF, the CR (\r) is still at the end of the line string.
// .getBytes().length + 1 correctly counts [LineContent] + [LF].
offset += contentLines.get(i).getBytes(StandardCharsets.UTF_8).length + 1;
}
offset += startColumn;
return offset;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,59 @@
*/
package com.github.gumtreediff.gen.treesitterng;

import com.github.gumtreediff.tree.Tree;
import com.github.gumtreediff.tree.TreeContext;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import static com.github.gumtreediff.gen.treesitterng.AbstractTreeSitterNgGenerator.matchNodeOrAncestorTypes;
import static org.junit.jupiter.api.Assertions.*;

public class AbstractTreeSitterNgGeneratorTest {
private final PythonTreeSitterNgTreeGenerator generator = new PythonTreeSitterNgTreeGenerator();

/**
 * Checks the position reported for a statement on the second line when the
 * input uses LF line endings.
 *
 * @throws IOException if tree generation from the string fails
 */
@Test
public void OffsetConsistency_testLFOffsets() throws IOException {
    // "x = 1" is 5 bytes and the LF terminator adds 1, so line 2 is expected at 6.
    final String source = "x = 1\n" + "y = 2";
    final TreeContext parsed = generator.generateFrom().string(source);

    // The second child of the root is expected to be the "y = 2" statement.
    final Tree root = parsed.getRoot();
    final Tree secondStatement = root.getChild(1);

    final String typeName = secondStatement.getType().name;
    assertEquals("expression_statement", typeName);

    final int position = secondStatement.getPos();
    assertEquals(6, position, "Line 2 should start at byte offset 6 for LF content");
}

/**
 * Checks the position reported for a statement on the second line when the
 * input uses CRLF line endings.
 *
 * @throws IOException if tree generation from the string fails
 */
@Test
public void OffsetConsistency_testCRLFOffsets() throws IOException {
    // "x = 1" is 5 bytes and the CRLF terminator adds 2, so line 2 is expected at 7.
    final String source = "x = 1\r\n" + "y = 2";
    final TreeContext parsed = generator.generateFrom().string(source);

    // The second child of the root is expected to be the "y = 2" statement.
    final Tree root = parsed.getRoot();
    final Tree secondStatement = root.getChild(1);

    final String typeName = secondStatement.getType().name;
    assertEquals("expression_statement", typeName);

    final int position = secondStatement.getPos();
    assertEquals(7, position, "Line 2 should start at byte offset 7 for CRLF content");
}

/**
 * Checks the position reported for a statement that follows a line holding a
 * multi-byte character.
 *
 * @throws IOException if tree generation from the string fails
 */
@Test
public void OffsetConsistency_testMultiByteOffsets() throws IOException {
    // First line in UTF-8: '#' (1) + ' ' (1) + snake emoji (4) + LF (1) = 7 bytes,
    // so the statement on line 2 is expected at 7.
    final String source = "# 🐍\n" + "x = 1";
    final TreeContext parsed = generator.generateFrom().string(source);

    // The second child of the root is expected to be the "x = 1" statement.
    final Tree root = parsed.getRoot();
    final Tree statementAfterComment = root.getChild(1);

    final String typeName = statementAfterComment.getType().name;
    assertEquals("expression_statement", typeName);

    final int position = statementAfterComment.getPos();
    assertEquals(7, position, "Line 2 should start at byte offset 7 after a 4-byte emoji and LF");
}

@Test
public void testMatchNodeOrAncestorTypes() {
MockTypeOnlyTreeSitterNode root = new MockTypeOnlyTreeSitterNode();
Expand Down