Skip to content

Commit e358c3c

Browse files
committed
ALP: Add writer pipeline integration and bidirectional cross-language tests

Wire ALP encoding into Java's writer pipeline so ParquetWriter can produce ALP-encoded parquet files. This enables bidirectional interop testing: Java writes ALP parquet that C++ reads, complementing the existing C++ writes / Java reads direction.

- Add alpEnabled ColumnProperty to ParquetProperties with isAlpEnabled()
- Update DefaultV2ValuesWriterFactory to select ALP writers for float/double
- Add withAlpEncoding() builder methods to ParquetWriter
- Add GenerateAlpParquet utility to produce test files from CSV data
- Add Java interop tests reading back Java-generated ALP parquet files
1 parent 9386419 commit e358c3c

7 files changed

Lines changed: 242 additions & 2 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ public class ParquetProperties {
5050
public static final int DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
5151
public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true;
5252
public static final boolean DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED = false;
53+
public static final boolean DEFAULT_IS_ALP_ENABLED = false;
5354
public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0;
5455
public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
5556
public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
@@ -132,6 +133,7 @@ public static WriterVersion fromString(String name) {
132133
private final int pageRowCountLimit;
133134
private final boolean pageWriteChecksumEnabled;
134135
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
136+
private final ColumnProperty<Boolean> alpEnabled;
135137
private final Map<String, String> extraMetaData;
136138
private final ColumnProperty<Boolean> statistics;
137139
private final ColumnProperty<Boolean> sizeStatistics;
@@ -164,6 +166,7 @@ private ParquetProperties(Builder builder) {
164166
this.pageRowCountLimit = builder.pageRowCountLimit;
165167
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
166168
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
169+
this.alpEnabled = builder.alpEnabled.build();
167170
this.extraMetaData = builder.extraMetaData;
168171
this.statistics = builder.statistics.build();
169172
this.sizeStatistics = builder.sizeStatistics.build();
@@ -259,6 +262,20 @@ public boolean isByteStreamSplitEnabled(ColumnDescriptor column) {
259262
}
260263
}
261264

265+
/**
266+
* Returns true if ALP encoding is enabled for the given column.
267+
* ALP encoding is only applicable to FLOAT and DOUBLE columns.
268+
*/
269+
public boolean isAlpEnabled(ColumnDescriptor column) {
270+
switch (column.getPrimitiveType().getPrimitiveTypeName()) {
271+
case FLOAT:
272+
case DOUBLE:
273+
return alpEnabled.getValue(column);
274+
default:
275+
return false;
276+
}
277+
}
278+
262279
public ByteBufferAllocator getAllocator() {
263280
return allocator;
264281
}
@@ -416,6 +433,7 @@ public static class Builder {
416433
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
417434
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
418435
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
436+
private final ColumnProperty.Builder<Boolean> alpEnabled;
419437
private Map<String, String> extraMetaData = new HashMap<>();
420438
private final ColumnProperty.Builder<Boolean> statistics;
421439
private final ColumnProperty.Builder<Boolean> sizeStatistics;
@@ -427,6 +445,7 @@ private Builder() {
427445
DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED
428446
? ByteStreamSplitMode.FLOATING_POINT
429447
: ByteStreamSplitMode.NONE);
448+
alpEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_ALP_ENABLED);
430449
bloomFilterEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_ENABLED);
431450
bloomFilterNDVs = ColumnProperty.<Long>builder().withDefaultValue(null);
432451
bloomFilterFPPs = ColumnProperty.<Double>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_FPP);
@@ -457,6 +476,7 @@ private Builder(ParquetProperties toCopy) {
457476
this.numBloomFilterCandidates = ColumnProperty.builder(toCopy.numBloomFilterCandidates);
458477
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
459478
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
479+
this.alpEnabled = ColumnProperty.builder(toCopy.alpEnabled);
460480
this.extraMetaData = toCopy.extraMetaData;
461481
this.statistics = ColumnProperty.builder(toCopy.statistics);
462482
this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics);
@@ -534,6 +554,29 @@ public Builder withExtendedByteStreamSplitEncoding(boolean enable) {
534554
return this;
535555
}
536556

557+
/**
558+
* Enable or disable ALP encoding for FLOAT and DOUBLE columns.
559+
*
560+
* @param enable whether ALP encoding should be enabled
561+
* @return this builder for method chaining.
562+
*/
563+
public Builder withAlpEncoding(boolean enable) {
564+
this.alpEnabled.withDefaultValue(enable);
565+
return this;
566+
}
567+
568+
/**
569+
* Enable or disable ALP encoding for the specified column.
570+
*
571+
* @param columnPath the path of the column (dot-string)
572+
* @param enable whether ALP encoding should be enabled
573+
* @return this builder for method chaining.
574+
*/
575+
public Builder withAlpEncoding(String columnPath, boolean enable) {
576+
this.alpEnabled.withValue(columnPath, enable);
577+
return this;
578+
}
579+
537580
/**
538581
* Set the Parquet format dictionary page size.
539582
*

parquet-column/src/main/java/org/apache/parquet/column/values/factory/DefaultV2ValuesWriterFactory.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import org.apache.parquet.column.Encoding;
2626
import org.apache.parquet.column.ParquetProperties;
2727
import org.apache.parquet.column.values.ValuesWriter;
28+
import org.apache.parquet.column.values.alp.AlpValuesWriter;
2829
import org.apache.parquet.column.values.bytestreamsplit.ByteStreamSplitValuesWriter;
2930
import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForInteger;
3031
import org.apache.parquet.column.values.delta.DeltaBinaryPackingValuesWriterForLong;
@@ -159,7 +160,9 @@ private ValuesWriter getInt96ValuesWriter(ColumnDescriptor path) {
159160

160161
private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path) {
161162
final ValuesWriter fallbackWriter;
162-
if (this.parquetProperties.isByteStreamSplitEnabled(path)) {
163+
if (this.parquetProperties.isAlpEnabled(path)) {
164+
fallbackWriter = new AlpValuesWriter.DoubleAlpValuesWriter();
165+
} else if (this.parquetProperties.isByteStreamSplitEnabled(path)) {
163166
fallbackWriter = new ByteStreamSplitValuesWriter.DoubleByteStreamSplitValuesWriter(
164167
parquetProperties.getInitialSlabSize(),
165168
parquetProperties.getPageSizeThreshold(),
@@ -176,7 +179,9 @@ private ValuesWriter getDoubleValuesWriter(ColumnDescriptor path) {
176179

177180
private ValuesWriter getFloatValuesWriter(ColumnDescriptor path) {
178181
final ValuesWriter fallbackWriter;
179-
if (this.parquetProperties.isByteStreamSplitEnabled(path)) {
182+
if (this.parquetProperties.isAlpEnabled(path)) {
183+
fallbackWriter = new AlpValuesWriter.FloatAlpValuesWriter();
184+
} else if (this.parquetProperties.isByteStreamSplitEnabled(path)) {
180185
fallbackWriter = new ByteStreamSplitValuesWriter.FloatByteStreamSplitValuesWriter(
181186
parquetProperties.getInitialSlabSize(),
182187
parquetProperties.getPageSizeThreshold(),

parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetWriter.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -705,6 +705,16 @@ public SELF withByteStreamSplitEncoding(String columnPath, boolean enableByteStr
705705
return self();
706706
}
707707

708+
public SELF withAlpEncoding(boolean enableAlp) {
709+
encodingPropsBuilder.withAlpEncoding(enableAlp);
710+
return self();
711+
}
712+
713+
public SELF withAlpEncoding(String columnPath, boolean enableAlp) {
714+
encodingPropsBuilder.withAlpEncoding(columnPath, enableAlp);
715+
return self();
716+
}
717+
708718
/**
709719
* Enable or disable dictionary encoding of the specified column for the constructed writer.
710720
*
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
package org.apache.parquet.hadoop;
20+
21+
import java.io.BufferedReader;
22+
import java.io.IOException;
23+
import java.io.InputStream;
24+
import java.io.InputStreamReader;
25+
import java.nio.charset.StandardCharsets;
26+
import java.nio.file.Files;
27+
import java.nio.file.Paths;
28+
import java.util.ArrayList;
29+
import java.util.List;
30+
import org.apache.hadoop.fs.Path;
31+
import org.apache.parquet.example.data.Group;
32+
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
33+
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
34+
import org.apache.parquet.schema.MessageType;
35+
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
36+
import org.apache.parquet.schema.Types;
37+
38+
/**
39+
* Standalone utility to generate ALP-encoded parquet files from CSV test data.
40+
*
41+
* <p>Reads the existing expect CSV files (alp_spotify1_expect.csv, alp_arade_expect.csv)
42+
* from test resources and writes ALP-encoded parquet files using the Java ALP encoder.
43+
*
44+
* <p>Usage: java GenerateAlpParquet [output_directory]
45+
* If no output directory is specified, files are written to the current directory.
46+
*/
47+
public class GenerateAlpParquet {
48+
49+
public static void main(String[] args) throws IOException {
50+
String outputDir = args.length > 0 ? args[0] : ".";
51+
Files.createDirectories(Paths.get(outputDir));
52+
53+
generateAlpParquet("/alp_arade_expect.csv", outputDir + "/alp_java_arade.parquet");
54+
System.out.println("Generated: " + outputDir + "/alp_java_arade.parquet");
55+
56+
generateAlpParquet("/alp_spotify1_expect.csv", outputDir + "/alp_java_spotify1.parquet");
57+
System.out.println("Generated: " + outputDir + "/alp_java_spotify1.parquet");
58+
}
59+
60+
private static void generateAlpParquet(String csvResource, String outputPath) throws IOException {
61+
// Read CSV
62+
String[] columnNames;
63+
List<double[]> rows = new ArrayList<>();
64+
65+
try (InputStream is = GenerateAlpParquet.class.getResourceAsStream(csvResource);
66+
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
67+
// Parse header
68+
String header = br.readLine();
69+
columnNames = header.split(",");
70+
71+
// Parse data rows
72+
String line;
73+
while ((line = br.readLine()) != null) {
74+
String[] parts = line.split(",");
75+
double[] values = new double[parts.length];
76+
for (int i = 0; i < parts.length; i++) {
77+
values[i] = Double.parseDouble(parts[i]);
78+
}
79+
rows.add(values);
80+
}
81+
}
82+
83+
// Build schema: all required DOUBLE columns
84+
Types.MessageTypeBuilder schemaBuilder = Types.buildMessage();
85+
for (String name : columnNames) {
86+
schemaBuilder.required(PrimitiveTypeName.DOUBLE).named(name);
87+
}
88+
MessageType schema = schemaBuilder.named("schema");
89+
90+
// Delete output file if it exists
91+
java.io.File outFile = new java.io.File(outputPath);
92+
if (outFile.exists()) {
93+
outFile.delete();
94+
}
95+
96+
// Write ALP-encoded parquet
97+
Path path = new Path(outFile.getAbsolutePath());
98+
SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
99+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
100+
.withType(schema)
101+
.withWriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0)
102+
.withAlpEncoding(true)
103+
.withDictionaryEncoding(false)
104+
.build()) {
105+
for (double[] row : rows) {
106+
Group group = groupFactory.newGroup();
107+
for (int c = 0; c < columnNames.length; c++) {
108+
group.append(columnNames[c], row[c]);
109+
}
110+
writer.write(group);
111+
}
112+
}
113+
}
114+
}

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,74 @@ public void testReadAlpSpotify1Parquet() throws IOException {
131131
}
132132
}
133133

134+
/**
135+
* Read the Java-generated ALP-encoded arade parquet file and verify all values
136+
* match the expected CSV.
137+
*/
138+
@Test
139+
public void testReadAlpJavaAradeParquet() throws IOException {
140+
Path parquetPath = resourcePath("alp_java_arade.parquet");
141+
String[] columnNames = {"value1", "value2", "value3", "value4"};
142+
int expectedRows = 15000;
143+
144+
double[][] expected = readExpectedCsv("/alp_arade_expect.csv", columnNames.length, expectedRows);
145+
146+
List<Group> rows = readParquetGroups(parquetPath);
147+
assertEquals("Row count should match", expectedRows, rows.size());
148+
149+
verifyAlpEncoding(parquetPath);
150+
151+
for (int r = 0; r < expectedRows; r++) {
152+
Group group = rows.get(r);
153+
for (int c = 0; c < columnNames.length; c++) {
154+
double actual = group.getDouble(columnNames[c], 0);
155+
assertEquals(
156+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
157+
Double.doubleToLongBits(expected[c][r]),
158+
Double.doubleToLongBits(actual));
159+
}
160+
}
161+
}
162+
163+
/**
164+
* Read the Java-generated ALP-encoded spotify1 parquet file and verify all values
165+
* match the expected CSV.
166+
*/
167+
@Test
168+
public void testReadAlpJavaSpotify1Parquet() throws IOException {
169+
Path parquetPath = resourcePath("alp_java_spotify1.parquet");
170+
String[] columnNames = {
171+
"danceability",
172+
"energy",
173+
"loudness",
174+
"speechiness",
175+
"acousticness",
176+
"instrumentalness",
177+
"liveness",
178+
"valence",
179+
"tempo"
180+
};
181+
int expectedRows = 15000;
182+
183+
double[][] expected = readExpectedCsv("/alp_spotify1_expect.csv", columnNames.length, expectedRows);
184+
185+
List<Group> rows = readParquetGroups(parquetPath);
186+
assertEquals("Row count should match", expectedRows, rows.size());
187+
188+
verifyAlpEncoding(parquetPath);
189+
190+
for (int r = 0; r < expectedRows; r++) {
191+
Group group = rows.get(r);
192+
for (int c = 0; c < columnNames.length; c++) {
193+
double actual = group.getDouble(columnNames[c], 0);
194+
assertEquals(
195+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
196+
Double.doubleToLongBits(expected[c][r]),
197+
Double.doubleToLongBits(actual));
198+
}
199+
}
200+
}
201+
134202
private List<Group> readParquetGroups(Path path) throws IOException {
135203
List<Group> rows = new ArrayList<>();
136204
try (ParquetReader<Group> reader =
176 KB
Binary file not shown.
304 KB
Binary file not shown.

0 commit comments

Comments
 (0)