Skip to content

Commit 0c393e0

Browse files
committed
first pass of ALP java implementation
1 parent 03457c5 commit 0c393e0

11 files changed

Lines changed: 1887 additions & 1610 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/ParquetProperties.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ public class ParquetProperties {
5050
public static final int DEFAULT_DICTIONARY_PAGE_SIZE = DEFAULT_PAGE_SIZE;
5151
public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = true;
5252
public static final boolean DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED = false;
53+
public static final boolean DEFAULT_IS_ALP_ENABLED = false;
5354
public static final WriterVersion DEFAULT_WRITER_VERSION = WriterVersion.PARQUET_1_0;
5455
public static final boolean DEFAULT_ESTIMATE_ROW_COUNT_FOR_PAGE_SIZE_CHECK = true;
5556
public static final int DEFAULT_MINIMUM_RECORD_COUNT_FOR_CHECK = 100;
@@ -132,6 +133,7 @@ public static WriterVersion fromString(String name) {
132133
private final int pageRowCountLimit;
133134
private final boolean pageWriteChecksumEnabled;
134135
private final ColumnProperty<ByteStreamSplitMode> byteStreamSplitEnabled;
136+
private final ColumnProperty<Boolean> alpEnabled;
135137
private final Map<String, String> extraMetaData;
136138
private final ColumnProperty<Boolean> statistics;
137139
private final ColumnProperty<Boolean> sizeStatistics;
@@ -164,6 +166,7 @@ private ParquetProperties(Builder builder) {
164166
this.pageRowCountLimit = builder.pageRowCountLimit;
165167
this.pageWriteChecksumEnabled = builder.pageWriteChecksumEnabled;
166168
this.byteStreamSplitEnabled = builder.byteStreamSplitEnabled.build();
169+
this.alpEnabled = builder.alpEnabled.build();
167170
this.extraMetaData = builder.extraMetaData;
168171
this.statistics = builder.statistics.build();
169172
this.sizeStatistics = builder.sizeStatistics.build();
@@ -259,6 +262,23 @@ public boolean isByteStreamSplitEnabled(ColumnDescriptor column) {
259262
}
260263
}
261264

265+
/**
266+
* Check if ALP encoding is enabled for the given column.
267+
* ALP encoding is only supported for FLOAT and DOUBLE types.
268+
*
269+
* @param column the column descriptor
270+
* @return true if the column is FLOAT or DOUBLE and ALP encoding is enabled for it; false for every other type
271+
*/
272+
public boolean isAlpEnabled(ColumnDescriptor column) {
273+
switch (column.getPrimitiveType().getPrimitiveTypeName()) {
274+
case FLOAT:
275+
case DOUBLE:
276+
return alpEnabled.getValue(column);
277+
default:
278+
return false;
279+
}
280+
}
281+
262282
public ByteBufferAllocator getAllocator() {
263283
return allocator;
264284
}
@@ -416,6 +436,7 @@ public static class Builder {
416436
private int pageRowCountLimit = DEFAULT_PAGE_ROW_COUNT_LIMIT;
417437
private boolean pageWriteChecksumEnabled = DEFAULT_PAGE_WRITE_CHECKSUM_ENABLED;
418438
private final ColumnProperty.Builder<ByteStreamSplitMode> byteStreamSplitEnabled;
439+
private final ColumnProperty.Builder<Boolean> alpEnabled;
419440
private Map<String, String> extraMetaData = new HashMap<>();
420441
private final ColumnProperty.Builder<Boolean> statistics;
421442
private final ColumnProperty.Builder<Boolean> sizeStatistics;
@@ -427,6 +448,7 @@ private Builder() {
427448
DEFAULT_IS_BYTE_STREAM_SPLIT_ENABLED
428449
? ByteStreamSplitMode.FLOATING_POINT
429450
: ByteStreamSplitMode.NONE);
451+
alpEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_IS_ALP_ENABLED);
430452
bloomFilterEnabled = ColumnProperty.<Boolean>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_ENABLED);
431453
bloomFilterNDVs = ColumnProperty.<Long>builder().withDefaultValue(null);
432454
bloomFilterFPPs = ColumnProperty.<Double>builder().withDefaultValue(DEFAULT_BLOOM_FILTER_FPP);
@@ -457,6 +479,7 @@ private Builder(ParquetProperties toCopy) {
457479
this.numBloomFilterCandidates = ColumnProperty.builder(toCopy.numBloomFilterCandidates);
458480
this.maxBloomFilterBytes = toCopy.maxBloomFilterBytes;
459481
this.byteStreamSplitEnabled = ColumnProperty.builder(toCopy.byteStreamSplitEnabled);
482+
this.alpEnabled = ColumnProperty.builder(toCopy.alpEnabled);
460483
this.extraMetaData = toCopy.extraMetaData;
461484
this.statistics = ColumnProperty.builder(toCopy.statistics);
462485
this.sizeStatistics = ColumnProperty.builder(toCopy.sizeStatistics);
@@ -534,6 +557,29 @@ public Builder withExtendedByteStreamSplitEncoding(boolean enable) {
534557
return this;
535558
}
536559

560+
/**
561+
* Enable or disable ALP encoding by default for all columns; the setting only takes effect for FLOAT and DOUBLE columns.
562+
*
563+
* @param enable whether ALP encoding should be enabled
564+
* @return this builder for method chaining.
565+
*/
566+
public Builder withAlpEncoding(boolean enable) {
567+
this.alpEnabled.withDefaultValue(enable);
568+
return this;
569+
}
570+
571+
/**
572+
* Enable or disable ALP encoding for the specified column.
573+
*
574+
* @param columnPath the path of the column (dot-string)
575+
* @param enable whether ALP encoding should be enabled
576+
* @return this builder for method chaining.
577+
*/
578+
public Builder withAlpEncoding(String columnPath, boolean enable) {
579+
this.alpEnabled.withValue(columnPath, enable);
580+
return this;
581+
}
582+
537583
/**
538584
* Set the Parquet format dictionary page size.
539585
*

parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java

Lines changed: 49 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@
1818
*/
1919
package org.apache.parquet.column.values.alp;
2020

21+
import org.apache.parquet.Preconditions;
22+
2123
/**
2224
* Constants for the ALP (Adaptive Lossless floating-Point) encoding.
2325
*
2426
* <p>ALP encoding converts floating-point values to integers using decimal scaling,
25-
* then applies Frame of Reference (FOR) encoding and bit-packing.
27+
* then applies Frame of Reference encoding and bit-packing.
2628
* Values that cannot be losslessly converted are stored as exceptions.
2729
*
2830
* <p>Based on the paper: "ALP: Adaptive Lossless floating-Point Compression" (SIGMOD 2024)
@@ -43,61 +45,55 @@ private AlpConstants() {
4345
/** ALP compression mode identifier (0 = ALP) */
4446
public static final int ALP_COMPRESSION_MODE = 0;
4547

46-
/** FOR encoding for integers (0 = FOR) */
48+
/** Frame of Reference encoding for integers (0 = Frame of Reference) */
4749
public static final int ALP_INTEGER_ENCODING_FOR = 0;
4850

4951
/** Size of the ALP page header in bytes */
5052
public static final int ALP_HEADER_SIZE = 8;
5153

52-
// ========== Vector Constants ==========
54+
// ========== Vector Size Constants ==========
5355

5456
/** Default number of elements per compressed vector (2^10 = 1024) */
55-
public static final int ALP_VECTOR_SIZE = 1024;
57+
public static final int DEFAULT_VECTOR_SIZE = 1024;
5658

5759
/** Log2 of the default vector size */
58-
public static final int ALP_VECTOR_SIZE_LOG = 10;
60+
public static final int DEFAULT_VECTOR_SIZE_LOG = 10;
61+
62+
/** Maximum allowed log2 of vector size */
63+
static final int MAX_LOG_VECTOR_SIZE = 16;
64+
65+
/** Minimum allowed log2 of vector size */
66+
static final int MIN_LOG_VECTOR_SIZE = 3;
5967

6068
// ========== Exponent/Factor Limits ==========
6169

6270
/** Maximum exponent for float encoding (10^10 ~ 10 billion) */
63-
public static final int FLOAT_MAX_EXPONENT = 10;
71+
static final int FLOAT_MAX_EXPONENT = 10;
6472

6573
/** Maximum exponent for double encoding (10^18 ~ 1 quintillion) */
66-
public static final int DOUBLE_MAX_EXPONENT = 18;
67-
68-
/** Number of (exponent, factor) combinations for float: sum(1..11) = 66 */
69-
public static final int FLOAT_COMBINATIONS = 66;
70-
71-
/** Number of (exponent, factor) combinations for double: sum(1..19) = 190 */
72-
public static final int DOUBLE_COMBINATIONS = 190;
74+
static final int DOUBLE_MAX_EXPONENT = 18;
7375

7476
// ========== Sampling Constants ==========
7577

76-
/** Number of values sampled per vector */
77-
public static final int SAMPLER_SAMPLES_PER_VECTOR = 256;
78-
79-
/** Number of sample vectors per rowgroup */
80-
public static final int SAMPLER_SAMPLE_VECTORS_PER_ROWGROUP = 8;
78+
/** Number of sample vectors used for preset caching */
79+
static final int SAMPLER_SAMPLE_VECTORS = 8;
8180

8281
/** Maximum (exponent, factor) combinations to keep in preset */
83-
public static final int MAX_COMBINATIONS = 5;
84-
85-
/** Stop sampling if this many consecutive combinations produce worse results */
86-
public static final int EARLY_EXIT_THRESHOLD = 4;
82+
static final int MAX_PRESET_COMBINATIONS = 5;
8783

8884
// ========== Fast Rounding Magic Numbers ==========
8985

9086
/**
9187
* Magic number for fast float rounding using the floating-point trick.
9288
* Formula: 2^22 + 2^23 = 12,582,912
9389
*/
94-
public static final float MAGIC_FLOAT = 12_582_912.0f;
90+
static final float MAGIC_FLOAT = 12_582_912.0f;
9591

9692
/**
9793
* Magic number for fast double rounding using the floating-point trick.
9894
* Formula: 2^51 + 2^52 = 6,755,399,441,055,744
9995
*/
100-
public static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0;
96+
static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0;
10197

10298
// ========== Metadata Sizes ==========
10399

@@ -113,24 +109,43 @@ private AlpConstants() {
113109
// ========== Precomputed Powers of 10 ==========
114110

115111
/** Precomputed powers of 10 for float encoding (10^0 to 10^10) */
116-
public static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
112+
static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
117113

118114
/** Precomputed powers of 10 for double encoding (10^0 to 10^18) */
119-
public static final double[] DOUBLE_POW10 = {
115+
static final double[] DOUBLE_POW10 = {
120116
1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18
121117
};
122118

123-
/** Precomputed negative powers of 10 for decoding (10^0 to 10^-18) */
124-
public static final double[] DOUBLE_POW10_NEG = {
125-
1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10, 1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16,
126-
1e-17, 1e-18
127-
};
128-
129119
// ========== Bit Masks for Negative Zero Detection ==========
130120

131121
/** Bit pattern for negative zero in float */
132-
public static final int FLOAT_NEGATIVE_ZERO_BITS = 0x80000000;
122+
static final int FLOAT_NEGATIVE_ZERO_BITS = 0x80000000;
133123

134124
/** Bit pattern for negative zero in double */
135-
public static final long DOUBLE_NEGATIVE_ZERO_BITS = 0x8000000000000000L;
125+
static final long DOUBLE_NEGATIVE_ZERO_BITS = 0x8000000000000000L;
126+
127+
// ========== Validation ==========
128+
129+
/**
130+
* Validate that a vector size is a power of 2 whose log2 lies between MIN_LOG_VECTOR_SIZE (3) and MAX_LOG_VECTOR_SIZE (16), i.e. a size from 8 to 65536.
131+
*
132+
* @param vectorSize the vector size to validate
133+
* @return the validated vector size
134+
* @throws IllegalArgumentException if the vector size is invalid
135+
*/
136+
static int validateVectorSize(int vectorSize) {
137+
Preconditions.checkArgument(
138+
vectorSize > 0 && (vectorSize & (vectorSize - 1)) == 0,
139+
"Vector size must be a power of 2, got: %s",
140+
vectorSize);
141+
int logSize = Integer.numberOfTrailingZeros(vectorSize);
142+
Preconditions.checkArgument(
143+
logSize >= MIN_LOG_VECTOR_SIZE && logSize <= MAX_LOG_VECTOR_SIZE,
144+
"Vector size log2 must be between %s and %s, got: %s (vectorSize=%s)",
145+
MIN_LOG_VECTOR_SIZE,
146+
MAX_LOG_VECTOR_SIZE,
147+
logSize,
148+
vectorSize);
149+
return vectorSize;
150+
}
136151
}

0 commit comments

Comments
 (0)