apache
diff --git a/‎parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java‎
Lines changed: 189 additions & 0 deletions b/‎parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java‎
Lines changed: 189 additions & 0 deletions
diff --git a/‎parquet-hadoop/src/test/resources/alp_arade.parquet‎
176 KB b/‎parquet-hadoop/src/test/resources/alp_arade.parquet‎
176 KB
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.parquet.hadoop;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.column.Encoding;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.hadoop.example.GroupReadSupport;
+import org.apache.parquet.hadoop.metadata.BlockMetaData;
+import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.junit.Test;
+
+/**
+ * Integration test for reading ALP (Adaptive Lossless floating-Point) encoded
+ * parquet files generated by the C++ implementation and verifying correctness
+ * against expected CSV data.
+ *
+ * <p>The test parquet files were generated using the generate_alp_parquet C++
+ * utility from Arrow, which encodes floating-point CSV datasets using ALP encoding.
+ */
+public class TestInteropAlpEncoding {
+
+  private static Path resourcePath(String name) {
+    try {
+      return new Path(TestInteropAlpEncoding.class.getResource("/" + name).toURI());
+    } catch (URISyntaxException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Read the ALP-encoded arade parquet file (4 double columns, 15000 rows)
+   * and verify all values match the expected CSV.
+   */
+  @Test
+  public void testReadAlpAradeParquet() throws IOException {
+    Path parquetPath = resourcePath("alp_arade.parquet");
+    String[] columnNames = {"value1", "value2", "value3", "value4"};
+    int expectedRows = 15000;
+
+    // Read expected values from CSV
+    double[][] expected = readExpectedCsv("/alp_arade_expect.csv", columnNames.length, expectedRows);
+
+    // Read parquet file using GroupReadSupport
+    List<Group> rows = readParquetGroups(parquetPath);
+    assertEquals("Row count should match", expectedRows, rows.size());
+
+    // Verify ALP encoding is used in metadata
+    verifyAlpEncoding(parquetPath);
+
+    // Compare all values
+    for (int r = 0; r < expectedRows; r++) {
+      Group group = rows.get(r);
+      for (int c = 0; c < columnNames.length; c++) {
+        double actual = group.getDouble(columnNames[c], 0);
+        assertEquals(
+            String.format("Mismatch at row %d, column %s", r, columnNames[c]),
+            Double.doubleToLongBits(expected[c][r]),
+            Double.doubleToLongBits(actual));
+      }
+    }
+  }
+
+  /**
+   * Read the ALP-encoded spotify1 parquet file (9 double columns, 15000 rows)
+   * and verify all values match the expected CSV.
+   */
+  @Test
+  public void testReadAlpSpotify1Parquet() throws IOException {
+    Path parquetPath = resourcePath("alp_spotify1.parquet");
+    String[] columnNames = {
+      "danceability",
+      "energy",
+      "loudness",
+      "speechiness",
+      "acousticness",
+      "instrumentalness",
+      "liveness",
+      "valence",
+      "tempo"
+    };
+    int expectedRows = 15000;
+
+    // Read expected values from CSV
+    double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv", columnNames.length, expectedRows);
+
+    // Read parquet file using GroupReadSupport
+    List<Group> rows = readParquetGroups(parquetPath);
+    assertEquals("Row count should match", expectedRows, rows.size());
+
+    // Verify ALP encoding is used in metadata
+    verifyAlpEncoding(parquetPath);
+
+    // Compare all values
+    for (int r = 0; r < expectedRows; r++) {
+      Group group = rows.get(r);
+      for (int c = 0; c < columnNames.length; c++) {
+        double actual = group.getDouble(columnNames[c], 0);
+        assertEquals(
+            String.format("Mismatch at row %d, column %s", r, columnNames[c]),
+            Double.doubleToLongBits(expected[c][r]),
+            Double.doubleToLongBits(actual));
+      }
+    }
+  }
+
+  private List<Group> readParquetGroups(Path path) throws IOException {
+    List<Group> rows = new ArrayList<>();
+    try (ParquetReader<Group> reader =
+        ParquetReader.builder(new GroupReadSupport(), path).build()) {
+      Group group;
+      while ((group = reader.read()) != null) {
+        rows.add(group);
+      }
+    }
+    return rows;
+  }
+
+  private void verifyAlpEncoding(Path path) throws IOException {
+    try (ParquetFileReader reader = ParquetFileReader.open(org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(
+        path, new org.apache.hadoop.conf.Configuration()))) {
+      List<BlockMetaData> blocks = reader.getFooter().getBlocks();
+      for (BlockMetaData block : blocks) {
+        for (ColumnChunkMetaData column : block.getColumns()) {
+          assertNotNull(
+              "Column " + column.getPath() + " should have encoding stats", column.getEncodingStats());
+          boolean hasAlp = column.getEncodings().contains(Encoding.ALP);
+          assertEquals("Column " + column.getPath() + " should use ALP encoding", true, hasAlp);
+        }
+      }
+    }
+  }
+
+  /**
+   * Parse expected CSV into column arrays.
+   * CSV format: header row, then data rows with comma-separated double values.
+   */
+  private double[][] readExpectedCsv(String resourcePath, int numColumns, int expectedRows) throws IOException {
+    double[][] columns = new double[numColumns][expectedRows];
+    try (InputStream is = getClass().getResourceAsStream(resourcePath);
+        BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+      assertNotNull("CSV resource not found: " + resourcePath, is);
+
+      // Skip header
+      String header = br.readLine();
+      assertNotNull("CSV should have a header", header);
+
+      int row = 0;
+      String line;
+      while ((line = br.readLine()) != null) {
+        String[] parts = line.split(",");
+        assertEquals("CSV row " + row + " should have " + numColumns + " columns", numColumns, parts.length);
+        for (int c = 0; c < numColumns; c++) {
+          columns[c][row] = Double.parseDouble(parts[c]);
+        }
+        row++;
+      }
+      assertEquals("CSV should have " + expectedRows + " data rows", expectedRows, row);
+    }
+    return columns;
+  }
+}