
Commit f989787

ALP: Add codec-level and pipeline throughput benchmarks
Add two benchmarks for measuring ALP performance:

- AlpCodecThroughput: codec-level encode/decode throughput in MB/s with 1M values across decimal, integer, and mixed datasets for both float and double types. Directly comparable to the C++ encoding_alp_benchmark.
- AlpDecompressionThroughput: full Parquet pipeline read throughput, measuring end-to-end decompression on real parquet files.

Also add Hadoop CRC files for the Java-generated test parquet resources.
1 parent c7c798b commit f989787
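Both benchmarks are plain JUnit 4 tests, so besides running them through the build they can be driven programmatically. A minimal sketch (the RunAlpBenchmarks driver class is hypothetical; it assumes both benchmark classes and JUnit 4 are on the classpath):

import org.junit.runner.JUnitCore;
import org.junit.runner.Result;

// Hypothetical driver: runs both benchmark classes in one JVM.
public class RunAlpBenchmarks {
  public static void main(String[] args) {
    Result result = JUnitCore.runClasses(
        org.apache.parquet.column.values.alp.benchmark.AlpCodecThroughput.class,
        org.apache.parquet.hadoop.AlpDecompressionThroughput.class);
    // The throughput tables are printed to stdout by the tests themselves.
    System.out.println("Failures: " + result.getFailureCount());
  }
}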

6 files changed: 350 additions & 0 deletions


org/apache/parquet/column/values/alp/benchmark/AlpCodecThroughput.java (221 additions & 0 deletions)

@@ -0,0 +1,221 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.column.values.alp.benchmark;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Random;
import org.apache.parquet.bytes.ByteBufferInputStream;
import org.apache.parquet.column.values.alp.AlpValuesReaderForDouble;
import org.apache.parquet.column.values.alp.AlpValuesReaderForFloat;
import org.apache.parquet.column.values.alp.AlpValuesWriter;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Codec-level ALP throughput benchmark reporting MB/s.
 *
 * <p>Comparable to C++ encoding_alp_benchmark.cc. Measures encode and decode
 * throughput at the codec level (no Parquet pipeline overhead).
 */
public class AlpCodecThroughput {

  private static final int NUM_VALUES = 1_000_000;
  private static final int WARMUP = 10;
  private static final int MEASURED = 30;

  // Datasets
  private static double[] doubleDecimal;
  private static double[] doubleInteger;
  private static double[] doubleMixed;
  private static float[] floatDecimal;
  private static float[] floatInteger;
  private static float[] floatMixed;

  // Pre-compressed
  private static byte[] doubleDecimalComp;
  private static byte[] doubleIntegerComp;
  private static byte[] doubleMixedComp;
  private static byte[] floatDecimalComp;
  private static byte[] floatIntegerComp;
  private static byte[] floatMixedComp;

  @BeforeClass
  public static void setup() throws IOException {
    Random rng = new Random(42);

    doubleDecimal = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleDecimal[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
    }

    doubleInteger = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleInteger[i] = (double) rng.nextInt(100000);
    }

    doubleMixed = new double[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      doubleMixed[i] = Math.round(rng.nextDouble() * 10000) / 100.0;
    }
    for (int i = 0; i < NUM_VALUES; i += 50) {
      doubleMixed[i] = Double.NaN;
    }

    floatDecimal = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatDecimal[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
    }

    floatInteger = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatInteger[i] = (float) rng.nextInt(100000);
    }

    floatMixed = new float[NUM_VALUES];
    for (int i = 0; i < NUM_VALUES; i++) {
      floatMixed[i] = Math.round(rng.nextFloat() * 10000) / 100.0f;
    }
    for (int i = 0; i < NUM_VALUES; i += 50) {
      floatMixed[i] = Float.NaN;
    }

    doubleDecimalComp = compressDoubles(doubleDecimal);
    doubleIntegerComp = compressDoubles(doubleInteger);
    doubleMixedComp = compressDoubles(doubleMixed);
    floatDecimalComp = compressFloats(floatDecimal);
    floatIntegerComp = compressFloats(floatInteger);
    floatMixedComp = compressFloats(floatMixed);
  }

  @Test
  public void measureThroughput() throws IOException {
    System.out.println();
    System.out.println("=== ALP Codec-Level Throughput (1M values) ===");
    System.out.printf("%-30s %10s %10s %10s %10s%n",
        "Dataset", "Enc MB/s", "Dec MB/s", "Raw KB", "Comp KB");
    System.out.println("------------------------------" +
        " ---------- ---------- ---------- ----------");

    benchDouble("double_decimal", doubleDecimal, doubleDecimalComp);
    benchDouble("double_integer", doubleInteger, doubleIntegerComp);
    benchDouble("double_mixed(2%exc)", doubleMixed, doubleMixedComp);
    benchFloat("float_decimal", floatDecimal, floatDecimalComp);
    benchFloat("float_integer", floatInteger, floatIntegerComp);
    benchFloat("float_mixed(2%exc)", floatMixed, floatMixedComp);

    System.out.println();
  }

  private void benchDouble(String name, double[] data, byte[] compressed) throws IOException {
    long rawBytes = (long) data.length * Double.BYTES;

    // Warmup encode
    for (int i = 0; i < WARMUP; i++) {
      compressDoubles(data);
    }
    // Measure encode
    long encNanos = 0;
    for (int i = 0; i < MEASURED; i++) {
      long t0 = System.nanoTime();
      compressDoubles(data);
      encNanos += System.nanoTime() - t0;
    }

    // Warmup decode
    for (int i = 0; i < WARMUP; i++) {
      decompressDoubles(compressed, data.length);
    }
    // Measure decode
    long decNanos = 0;
    for (int i = 0; i < MEASURED; i++) {
      long t0 = System.nanoTime();
      decompressDoubles(compressed, data.length);
      decNanos += System.nanoTime() - t0;
    }

    double encMBps = (rawBytes * MEASURED / (encNanos / 1e9)) / (1024.0 * 1024.0);
    double decMBps = (rawBytes * MEASURED / (decNanos / 1e9)) / (1024.0 * 1024.0);

    System.out.printf("%-30s %10.1f %10.1f %10d %10d%n",
        name, encMBps, decMBps, rawBytes / 1024, compressed.length / 1024);
  }

  private void benchFloat(String name, float[] data, byte[] compressed) throws IOException {
    long rawBytes = (long) data.length * Float.BYTES;

    for (int i = 0; i < WARMUP; i++) {
      compressFloats(data);
    }
    long encNanos = 0;
    for (int i = 0; i < MEASURED; i++) {
      long t0 = System.nanoTime();
      compressFloats(data);
      encNanos += System.nanoTime() - t0;
    }

    for (int i = 0; i < WARMUP; i++) {
      decompressFloats(compressed, data.length);
    }
    long decNanos = 0;
    for (int i = 0; i < MEASURED; i++) {
      long t0 = System.nanoTime();
      decompressFloats(compressed, data.length);
      decNanos += System.nanoTime() - t0;
    }

    double encMBps = (rawBytes * MEASURED / (encNanos / 1e9)) / (1024.0 * 1024.0);
    double decMBps = (rawBytes * MEASURED / (decNanos / 1e9)) / (1024.0 * 1024.0);

    System.out.printf("%-30s %10.1f %10.1f %10d %10d%n",
        name, encMBps, decMBps, rawBytes / 1024, compressed.length / 1024);
  }

  private static byte[] compressDoubles(double[] values) throws IOException {
    AlpValuesWriter.DoubleAlpValuesWriter writer = new AlpValuesWriter.DoubleAlpValuesWriter();
    for (double v : values) {
      writer.writeDouble(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static byte[] compressFloats(float[] values) throws IOException {
    AlpValuesWriter.FloatAlpValuesWriter writer = new AlpValuesWriter.FloatAlpValuesWriter();
    for (float v : values) {
      writer.writeFloat(v);
    }
    return writer.getBytes().toByteArray();
  }

  private static void decompressDoubles(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForDouble reader = new AlpValuesReaderForDouble();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readDouble();
    }
  }

  private static void decompressFloats(byte[] compressed, int numValues) throws IOException {
    AlpValuesReaderForFloat reader = new AlpValuesReaderForFloat();
    reader.initFromPage(numValues, ByteBufferInputStream.wrap(ByteBuffer.wrap(compressed)));
    for (int i = 0; i < numValues; i++) {
      reader.readFloat();
    }
  }
}
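The MB/s columns above divide the total raw bytes processed across all measured iterations by the total elapsed seconds, then scale to MiB. A worked instance of that formula with a hypothetical timing (the 0.6 s total is invented for illustration):

public class ThroughputMath {
  public static void main(String[] args) {
    long rawBytes = 1_000_000L * Double.BYTES; // 8,000,000 bytes per iteration
    int measured = 30;                         // matches MEASURED above
    long encNanos = 600_000_000L;              // hypothetical: 0.6 s summed over 30 runs
    // Same expression as in benchDouble: total bytes / total seconds, in MiB/s.
    double encMBps = (rawBytes * measured / (encNanos / 1e9)) / (1024.0 * 1024.0);
    System.out.printf("Enc MB/s = %.1f%n", encMBps); // prints 381.5
  }
}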
org/apache/parquet/hadoop/AlpDecompressionThroughput.java (129 additions & 0 deletions)

@@ -0,0 +1,129 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import java.io.IOException;
import java.net.URISyntaxException;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.junit.Test;

/**
 * Measures ALP decompression throughput in bytes/second for real-world datasets.
 * Reports raw (uncompressed) bytes/second and compressed bytes/second.
 */
public class AlpDecompressionThroughput {

  private static final int WARMUP_ITERS = 5;
  private static final int MEASURED_ITERS = 20;

  private static Path resourcePath(String name) {
    try {
      return new Path(AlpDecompressionThroughput.class.getResource("/" + name).toURI());
    } catch (URISyntaxException e) {
      throw new RuntimeException(e);
    }
  }

  @Test
  public void measureDecompressionThroughput() throws IOException {
    System.out.println();
    System.out.println("=== ALP Decompression Throughput ===");
    System.out.printf(
        "%-40s %8s %6s %6s %12s %12s %12s%n",
        "File", "Rows", "Cols", "Type", "Compressed", "Raw MB/s", "Comp MB/s");
    System.out.println(
        "---------------------------------------- -------- ------ ------ ------------ ------------ ------------");

    // Double datasets
    benchmarkFile("alp_arade.parquet", 15000, 4, "double", 8);
    benchmarkFile("alp_spotify1.parquet", 15000, 9, "double", 8);
    benchmarkFile("alp_java_arade.parquet", 15000, 4, "double", 8);
    benchmarkFile("alp_java_spotify1.parquet", 15000, 9, "double", 8);

    // Float datasets
    benchmarkFile("alp_float_arade.parquet", 15000, 4, "float", 4);
    benchmarkFile("alp_float_spotify1.parquet", 15000, 9, "float", 4);
    benchmarkFile("alp_java_float_arade.parquet", 15000, 4, "float", 4);
    benchmarkFile("alp_java_float_spotify1.parquet", 15000, 9, "float", 4);

    System.out.println();
  }

  private void benchmarkFile(String fileName, int expectedRows, int numCols, String type, int bytesPerValue)
      throws IOException {
    Path path = resourcePath(fileName);

    // Get compressed file size from parquet metadata
    long compressedSize = 0;
    try (ParquetFileReader pfr = ParquetFileReader.open(
        org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(
            path, new org.apache.hadoop.conf.Configuration()))) {
      ParquetMetadata footer = pfr.getFooter();
      for (org.apache.parquet.hadoop.metadata.BlockMetaData block : footer.getBlocks()) {
        compressedSize += block.getTotalByteSize();
      }
    }

    long rawBytes = (long) expectedRows * numCols * bytesPerValue;

    // Warmup
    for (int i = 0; i < WARMUP_ITERS; i++) {
      readAllValues(path, type, numCols);
    }

    // Measured runs
    long totalNanos = 0;
    for (int i = 0; i < MEASURED_ITERS; i++) {
      long start = System.nanoTime();
      readAllValues(path, type, numCols);
      totalNanos += System.nanoTime() - start;
    }

    double avgSeconds = (totalNanos / (double) MEASURED_ITERS) / 1_000_000_000.0;
    double rawMBps = (rawBytes / avgSeconds) / (1024.0 * 1024.0);
    double compMBps = (compressedSize / avgSeconds) / (1024.0 * 1024.0);

    System.out.printf(
        "%-40s %8d %6d %6s %12d %12.1f %12.1f%n",
        fileName, expectedRows, numCols, type, compressedSize, rawMBps, compMBps);
  }

  private void readAllValues(Path path, String type, int numCols) throws IOException {
    try (ParquetReader<Group> reader =
        ParquetReader.builder(new GroupReadSupport(), path).build()) {
      Group group;
      if ("double".equals(type)) {
        while ((group = reader.read()) != null) {
          for (int c = 0; c < numCols; c++) {
            group.getDouble(c, 0);
          }
        }
      } else {
        while ((group = reader.read()) != null) {
          for (int c = 0; c < numCols; c++) {
            group.getFloat(c, 0);
          }
        }
      }
    }
  }
}
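Note that the Raw MB/s column is computed from the expected uncompressed size (rows × cols × bytesPerValue), not from a measured buffer. A small sketch of that arithmetic for alp_spotify1.parquet, assuming a hypothetical 10 ms average read:

public class RawBytesMath {
  public static void main(String[] args) {
    // alp_spotify1.parquet: 15000 rows x 9 double columns x 8 bytes per value
    long rawBytes = 15000L * 9 * 8;  // 1,080,000 bytes
    double avgSeconds = 0.010;       // hypothetical 10 ms average per read
    double rawMBps = (rawBytes / avgSeconds) / (1024.0 * 1024.0);
    System.out.printf("Raw MB/s = %.1f%n", rawMBps); // prints 103.0
  }
}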
4 binary .crc files (Hadoop CRC checksums for the Java-generated test parquet resources; 1.39 KB and 2.38 KB among the shown sizes): Binary files not shown.

0 commit comments
