Skip to content

Commit c7c798b

Browse files
committed
ALP: Add float32 cross-language tests and generator support
Add float32 (FLOAT) coverage to the ALP encoding interop tests:

- Generator: generateAlpParquetFloat() writes float32 ALP parquet from float expect CSVs using a PrimitiveTypeName.FLOAT schema
- Tests: readExpectedCsvFloat() and 4 new test methods using Float.floatToIntBits() for bit-exact verification of the C++- and Java-generated float parquets for the spotify1 and arade datasets
- Add *.csv to the RAT license check exclusions
- Add float32 test resources (expect CSVs, C++ and Java parquets)
1 parent e358c3c commit c7c798b

9 files changed

Lines changed: 30232 additions & 0 deletions

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/GenerateAlpParquet.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,14 @@ public static void main(String[] args) throws IOException {
5555

5656
generateAlpParquet("/alp_spotify1_expect.csv", outputDir + "/alp_java_spotify1.parquet");
5757
System.out.println("Generated: " + outputDir + "/alp_java_spotify1.parquet");
58+
59+
generateAlpParquetFloat(
60+
"/alp_float_arade_expect.csv", outputDir + "/alp_java_float_arade.parquet");
61+
System.out.println("Generated: " + outputDir + "/alp_java_float_arade.parquet");
62+
63+
generateAlpParquetFloat(
64+
"/alp_float_spotify1_expect.csv", outputDir + "/alp_java_float_spotify1.parquet");
65+
System.out.println("Generated: " + outputDir + "/alp_java_float_spotify1.parquet");
5866
}
5967

6068
private static void generateAlpParquet(String csvResource, String outputPath) throws IOException {
@@ -111,4 +119,60 @@ private static void generateAlpParquet(String csvResource, String outputPath) th
111119
}
112120
}
113121
}
122+
123+
private static void generateAlpParquetFloat(String csvResource, String outputPath)
124+
throws IOException {
125+
// Read CSV into float values
126+
String[] columnNames;
127+
List<float[]> rows = new ArrayList<>();
128+
129+
try (InputStream is = GenerateAlpParquet.class.getResourceAsStream(csvResource);
130+
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
131+
// Parse header
132+
String header = br.readLine();
133+
columnNames = header.split(",");
134+
135+
// Parse data rows
136+
String line;
137+
while ((line = br.readLine()) != null) {
138+
String[] parts = line.split(",");
139+
float[] values = new float[parts.length];
140+
for (int i = 0; i < parts.length; i++) {
141+
values[i] = Float.parseFloat(parts[i]);
142+
}
143+
rows.add(values);
144+
}
145+
}
146+
147+
// Build schema: all required FLOAT columns
148+
Types.MessageTypeBuilder schemaBuilder = Types.buildMessage();
149+
for (String name : columnNames) {
150+
schemaBuilder.required(PrimitiveTypeName.FLOAT).named(name);
151+
}
152+
MessageType schema = schemaBuilder.named("schema");
153+
154+
// Delete output file if it exists
155+
java.io.File outFile = new java.io.File(outputPath);
156+
if (outFile.exists()) {
157+
outFile.delete();
158+
}
159+
160+
// Write ALP-encoded parquet
161+
Path path = new Path(outFile.getAbsolutePath());
162+
SimpleGroupFactory groupFactory = new SimpleGroupFactory(schema);
163+
try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
164+
.withType(schema)
165+
.withWriterVersion(org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0)
166+
.withAlpEncoding(true)
167+
.withDictionaryEncoding(false)
168+
.build()) {
169+
for (float[] row : rows) {
170+
Group group = groupFactory.newGroup();
171+
for (int c = 0; c < columnNames.length; c++) {
172+
group.append(columnNames[c], row[c]);
173+
}
174+
writer.write(group);
175+
}
176+
}
177+
}
114178
}

parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestInteropAlpEncoding.java

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,142 @@ public void testReadAlpJavaSpotify1Parquet() throws IOException {
199199
}
200200
}
201201

202+
/**
203+
* Read the ALP-encoded float32 arade parquet file (C++ generated)
204+
* and verify all values match the expected CSV.
205+
*/
206+
@Test
207+
public void testReadAlpFloatAradeParquet() throws IOException {
208+
Path parquetPath = resourcePath("alp_float_arade.parquet");
209+
String[] columnNames = {"value1", "value2", "value3", "value4"};
210+
int expectedRows = 15000;
211+
212+
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv", columnNames.length, expectedRows);
213+
214+
List<Group> rows = readParquetGroups(parquetPath);
215+
assertEquals("Row count should match", expectedRows, rows.size());
216+
217+
verifyAlpEncoding(parquetPath);
218+
219+
for (int r = 0; r < expectedRows; r++) {
220+
Group group = rows.get(r);
221+
for (int c = 0; c < columnNames.length; c++) {
222+
float actual = group.getFloat(columnNames[c], 0);
223+
assertEquals(
224+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
225+
Float.floatToIntBits(expected[c][r]),
226+
Float.floatToIntBits(actual));
227+
}
228+
}
229+
}
230+
231+
/**
232+
* Read the ALP-encoded float32 spotify1 parquet file (C++ generated)
233+
* and verify all values match the expected CSV.
234+
*/
235+
@Test
236+
public void testReadAlpFloatSpotify1Parquet() throws IOException {
237+
Path parquetPath = resourcePath("alp_float_spotify1.parquet");
238+
String[] columnNames = {
239+
"danceability",
240+
"energy",
241+
"loudness",
242+
"speechiness",
243+
"acousticness",
244+
"instrumentalness",
245+
"liveness",
246+
"valence",
247+
"tempo"
248+
};
249+
int expectedRows = 15000;
250+
251+
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv", columnNames.length, expectedRows);
252+
253+
List<Group> rows = readParquetGroups(parquetPath);
254+
assertEquals("Row count should match", expectedRows, rows.size());
255+
256+
verifyAlpEncoding(parquetPath);
257+
258+
for (int r = 0; r < expectedRows; r++) {
259+
Group group = rows.get(r);
260+
for (int c = 0; c < columnNames.length; c++) {
261+
float actual = group.getFloat(columnNames[c], 0);
262+
assertEquals(
263+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
264+
Float.floatToIntBits(expected[c][r]),
265+
Float.floatToIntBits(actual));
266+
}
267+
}
268+
}
269+
270+
/**
271+
* Read the Java-generated ALP-encoded float32 arade parquet file
272+
* and verify all values match the expected CSV.
273+
*/
274+
@Test
275+
public void testReadAlpJavaFloatAradeParquet() throws IOException {
276+
Path parquetPath = resourcePath("alp_java_float_arade.parquet");
277+
String[] columnNames = {"value1", "value2", "value3", "value4"};
278+
int expectedRows = 15000;
279+
280+
float[][] expected = readExpectedCsvFloat("/alp_float_arade_expect.csv", columnNames.length, expectedRows);
281+
282+
List<Group> rows = readParquetGroups(parquetPath);
283+
assertEquals("Row count should match", expectedRows, rows.size());
284+
285+
verifyAlpEncoding(parquetPath);
286+
287+
for (int r = 0; r < expectedRows; r++) {
288+
Group group = rows.get(r);
289+
for (int c = 0; c < columnNames.length; c++) {
290+
float actual = group.getFloat(columnNames[c], 0);
291+
assertEquals(
292+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
293+
Float.floatToIntBits(expected[c][r]),
294+
Float.floatToIntBits(actual));
295+
}
296+
}
297+
}
298+
299+
/**
300+
* Read the Java-generated ALP-encoded float32 spotify1 parquet file
301+
* and verify all values match the expected CSV.
302+
*/
303+
@Test
304+
public void testReadAlpJavaFloatSpotify1Parquet() throws IOException {
305+
Path parquetPath = resourcePath("alp_java_float_spotify1.parquet");
306+
String[] columnNames = {
307+
"danceability",
308+
"energy",
309+
"loudness",
310+
"speechiness",
311+
"acousticness",
312+
"instrumentalness",
313+
"liveness",
314+
"valence",
315+
"tempo"
316+
};
317+
int expectedRows = 15000;
318+
319+
float[][] expected = readExpectedCsvFloat("/alp_float_spotify1_expect.csv", columnNames.length, expectedRows);
320+
321+
List<Group> rows = readParquetGroups(parquetPath);
322+
assertEquals("Row count should match", expectedRows, rows.size());
323+
324+
verifyAlpEncoding(parquetPath);
325+
326+
for (int r = 0; r < expectedRows; r++) {
327+
Group group = rows.get(r);
328+
for (int c = 0; c < columnNames.length; c++) {
329+
float actual = group.getFloat(columnNames[c], 0);
330+
assertEquals(
331+
String.format("Mismatch at row %d, column %s", r, columnNames[c]),
332+
Float.floatToIntBits(expected[c][r]),
333+
Float.floatToIntBits(actual));
334+
}
335+
}
336+
}
337+
202338
private List<Group> readParquetGroups(Path path) throws IOException {
203339
List<Group> rows = new ArrayList<>();
204340
try (ParquetReader<Group> reader =
@@ -254,4 +390,33 @@ private double[][] readExpectedCsv(String resourcePath, int numColumns, int expe
254390
}
255391
return columns;
256392
}
393+
394+
/**
395+
* Parse expected CSV into float column arrays.
396+
* CSV format: header row, then data rows with comma-separated float values.
397+
*/
398+
private float[][] readExpectedCsvFloat(String resourcePath, int numColumns, int expectedRows) throws IOException {
399+
float[][] columns = new float[numColumns][expectedRows];
400+
try (InputStream is = getClass().getResourceAsStream(resourcePath);
401+
BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
402+
assertNotNull("CSV resource not found: " + resourcePath, is);
403+
404+
// Skip header
405+
String header = br.readLine();
406+
assertNotNull("CSV should have a header", header);
407+
408+
int row = 0;
409+
String line;
410+
while ((line = br.readLine()) != null) {
411+
String[] parts = line.split(",");
412+
assertEquals("CSV row " + row + " should have " + numColumns + " columns", numColumns, parts.length);
413+
for (int c = 0; c < numColumns; c++) {
414+
columns[c][row] = Float.parseFloat(parts[c]);
415+
}
416+
row++;
417+
}
418+
assertEquals("CSV should have " + expectedRows + " data rows", expectedRows, row);
419+
}
420+
return columns;
421+
}
257422
}
265 KB
Binary file not shown.

0 commit comments

Comments
 (0)