Skip to content

Commit 87b61a3

Browse files
committed
ALP: Add cross-implementation tests and fix encode/decode to match C++
Add AlpCrossImplTest with 7 test cases that decode C++ reference blobs and verify bit-identical output. Reference blobs were generated by the C++ Arrow ALP implementation via generate_reference_blobs.cc.

Fix encode/decode math to use two-step multiplication matching C++:
- Encode: value * 10^exponent * 10^(-factor)
- Decode: encoded * 10^factor * 10^(-exponent)

The previous single-multiplier approach (encode: value * (10^e / 10^f); decode: encoded / (10^e / 10^f)) produced 1-ULP differences due to different intermediate floating-point rounding.
1 parent 4800cf7 commit 87b61a3

3 files changed

Lines changed: 333 additions & 34 deletions

File tree

parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpConstants.java

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,6 +69,12 @@ private AlpConstants() {
6969

7070
static final float[] FLOAT_POW10 = {1e0f, 1e1f, 1e2f, 1e3f, 1e4f, 1e5f, 1e6f, 1e7f, 1e8f, 1e9f, 1e10f};
7171

72+
// Negative powers of 10 as float, matching C++ PowerOfTenFloat(-power).
73+
// Used in the two-step encode/decode to match C++ floating-point rounding behavior.
74+
static final float[] FLOAT_POW10_NEGATIVE = {
75+
1e0f, 1e-1f, 1e-2f, 1e-3f, 1e-4f, 1e-5f, 1e-6f, 1e-7f, 1e-8f, 1e-9f, 1e-10f
76+
};
77+
7278
// ========== Double-specific ==========
7379
static final int DOUBLE_MAX_EXPONENT = 18;
7480
static final double MAGIC_DOUBLE = 6_755_399_441_055_744.0; // 2^51 + 2^52
@@ -80,6 +86,12 @@ private AlpConstants() {
8086
1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18
8187
};
8288

89+
// Negative powers of 10 as double, matching C++ PowerOfTenDouble(-power).
90+
static final double[] DOUBLE_POW10_NEGATIVE = {
91+
1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10,
92+
1e-11, 1e-12, 1e-13, 1e-14, 1e-15, 1e-16, 1e-17, 1e-18
93+
};
94+
8395
// ========== Per-vector metadata sizes ==========
8496
public static final int ALP_INFO_SIZE = 4; // exponent(1) + factor(1) + num_exceptions(2)
8597
public static final int FLOAT_FOR_INFO_SIZE = 5; // frame_of_reference(4) + bit_width(1)

parquet-column/src/main/java/org/apache/parquet/column/values/alp/AlpEncoderDecoder.java

Lines changed: 18 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
* then applying Frame of Reference encoding and bit-packing.
2828
* Values that cannot be losslessly converted are stored as exceptions.
2929
*
30-
* <p>Encoding formula: encoded = round(value * 10^exponent / 10^factor)
31-
* <p>Decoding formula: value = encoded / 10^exponent * 10^factor
30+
* <p>Encoding formula: encoded = round(value * 10^exponent * 10^(-factor))
31+
* <p>Decoding formula: value = encoded * 10^factor * 10^(-exponent)
3232
*
3333
* <p>Exception conditions:
3434
* <ul>
@@ -45,26 +45,6 @@ private AlpEncoderDecoder() {
4545
// Utility class
4646
}
4747

48-
// ========== Float multiplier ==========
49-
50-
static float getFloatMultiplier(int exponent, int factor) {
51-
float multiplier = FLOAT_POW10[exponent];
52-
if (factor > 0) {
53-
multiplier /= FLOAT_POW10[factor];
54-
}
55-
return multiplier;
56-
}
57-
58-
// ========== Double multiplier ==========
59-
60-
static double getDoubleMultiplier(int exponent, int factor) {
61-
double multiplier = DOUBLE_POW10[exponent];
62-
if (factor > 0) {
63-
multiplier /= DOUBLE_POW10[factor];
64-
}
65-
return multiplier;
66-
}
67-
6848
// ========== Float exception detection ==========
6949

7050
/** NaN, Inf, and -0.0 can never be encoded regardless of exponent/factor. */
@@ -83,9 +63,8 @@ static boolean isFloatException(float value, int exponent, int factor) {
8363
if (isFloatException(value)) {
8464
return true;
8565
}
86-
float multiplier = getFloatMultiplier(exponent, factor);
87-
float scaled = value * multiplier;
88-
if (scaled > Integer.MAX_VALUE || scaled < Integer.MIN_VALUE) {
66+
float scaled = value * FLOAT_POW10[exponent] * FLOAT_POW10_NEGATIVE[factor];
67+
if (scaled > FLOAT_ENCODING_UPPER_LIMIT || scaled < FLOAT_ENCODING_LOWER_LIMIT) {
8968
return true;
9069
}
9170
int encoded = encodeFloat(value, exponent, factor);
@@ -94,15 +73,18 @@ static boolean isFloatException(float value, int exponent, int factor) {
9473
}
9574

9675
// ========== Float encode/decode ==========
76+
// Two-step multiplication matching C++ to produce identical floating-point rounding.
77+
// C++ encode: value * 10^exponent * 10^(-factor)
78+
// C++ decode: (float)encoded * 10^factor * 10^(-exponent)
9779

98-
/** Encode: round(value * 10^exponent / 10^factor) */
80+
/** Encode: round(value * 10^exponent * 10^(-factor)) */
9981
static int encodeFloat(float value, int exponent, int factor) {
100-
return fastRoundFloat(value * getFloatMultiplier(exponent, factor));
82+
return fastRoundFloat(value * FLOAT_POW10[exponent] * FLOAT_POW10_NEGATIVE[factor]);
10183
}
10284

103-
/** Decode: encoded / 10^exponent * 10^factor */
85+
/** Decode: encoded * 10^factor * 10^(-exponent) */
10486
static float decodeFloat(int encoded, int exponent, int factor) {
105-
return encoded / getFloatMultiplier(exponent, factor);
87+
return (float) encoded * FLOAT_POW10[factor] * FLOAT_POW10_NEGATIVE[exponent];
10688
}
10789

10890
// Uses the 2^22+2^23 magic-number trick to round without branching on the FPU.
@@ -130,9 +112,8 @@ static boolean isDoubleException(double value, int exponent, int factor) {
130112
if (isDoubleException(value)) {
131113
return true;
132114
}
133-
double multiplier = getDoubleMultiplier(exponent, factor);
134-
double scaled = value * multiplier;
135-
if (scaled > Long.MAX_VALUE || scaled < Long.MIN_VALUE) {
115+
double scaled = value * DOUBLE_POW10[exponent] * DOUBLE_POW10_NEGATIVE[factor];
116+
if (scaled > DOUBLE_ENCODING_UPPER_LIMIT || scaled < DOUBLE_ENCODING_LOWER_LIMIT) {
136117
return true;
137118
}
138119
long encoded = encodeDouble(value, exponent, factor);
@@ -141,13 +122,16 @@ static boolean isDoubleException(double value, int exponent, int factor) {
141122
}
142123

143124
// ========== Double encode/decode ==========
125+
// Two-step multiplication matching C++ to produce identical floating-point rounding.
144126

127+
/** Encode: round(value * 10^exponent * 10^(-factor)) */
145128
static long encodeDouble(double value, int exponent, int factor) {
146-
return fastRoundDouble(value * getDoubleMultiplier(exponent, factor));
129+
return fastRoundDouble(value * DOUBLE_POW10[exponent] * DOUBLE_POW10_NEGATIVE[factor]);
147130
}
148131

132+
/** Decode: encoded * 10^factor * 10^(-exponent) */
149133
static double decodeDouble(long encoded, int exponent, int factor) {
150-
return encoded / getDoubleMultiplier(exponent, factor);
134+
return (double) encoded * DOUBLE_POW10[factor] * DOUBLE_POW10_NEGATIVE[exponent];
151135
}
152136

153137
// Same trick but with 2^51+2^52 for double precision.

0 commit comments

Comments
 (0)