use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testUseStatsWithSignedSortOrder.
/**
 * Checks conversion of binary statistics when the
 * {@code parquet.strings.signed-min-max.enabled} option is switched on.
 *
 * <p>Statistics are accumulated over the strings "A" and "z" interleaved with three
 * nulls, serialized through {@code helper}, converted back, and then verified: the
 * null count must always be 3; for {@code StatsHelper.V1} the converted stats are
 * expected to carry no min/max, while for any other helper min and max must be
 * "A" and "z".
 *
 * @param helper strategy that produces the format-level statistics under test
 */
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
  // Override the defaults so that stats accumulated using signed order are accepted.
  Configuration signedConf = new Configuration();
  signedConf.setBoolean("parquet.strings.signed-min-max.enabled", true);
  ParquetMetadataConverter signedConverter = new ParquetMetadataConverter(signedConf);

  // Two distinct values surrounded by three nulls.
  BinaryStatistics accumulated = new BinaryStatistics();
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("A"));
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("z"));
  accumulated.incrementNumNulls();

  PrimitiveType utf8Type = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8)
      .named("b");
  org.apache.parquet.format.Statistics serialized = helper.toParquetStatistics(accumulated);
  Statistics convertedStats =
      signedConverter.fromParquetStatistics(Version.FULL_VERSION, serialized, utf8Type);

  // Checks that hold regardless of the helper in use.
  Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
  Assert.assertTrue(convertedStats.isNumNullsSet());
  Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());

  if (helper == StatsHelper.V1) {
    assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
    return;
  }

  Assert.assertEquals(
      "Should have correct min (unsigned sort)",
      Binary.fromString("A"),
      convertedStats.genericGetMin());
  Assert.assertEquals(
      "Should have correct max (unsigned sort)",
      Binary.fromString("z"),
      convertedStats.genericGetMax());
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testBinaryStats.
/**
 * Exercises the size check applied to binary statistics during conversion.
 *
 * <p>Phase one uses a 904-byte min and a 2388-byte max: {@code isSmallerThan} is
 * checked at the exact boundary, the serialized stats must not have min/max set,
 * must carry matching min_value/max_value when the helper is {@code StatsHelper.V2},
 * and must keep the null count of 3004. Phase two sets both bounds to the 2388-byte
 * array and expects the serialized stats to have no field set at all, and the
 * round-tripped stats to be empty.
 *
 * @param helper strategy that produces the format-level statistics under test
 */
private void testBinaryStats(StatsHelper helper) {
  // Make fake stats and verify the size check.
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls(3004);

  byte[] smallBound = new byte[904];
  byte[] largeBound = new byte[2388];
  stats.updateStats(Binary.fromConstantByteArray(smallBound));
  stats.updateStats(Binary.fromConstantByteArray(largeBound));

  long combinedSize = smallBound.length + largeBound.length;
  Assert.assertFalse(
      "Should not be smaller than min + max size",
      stats.isSmallerThan(combinedSize));
  Assert.assertTrue(
      "Should be smaller than min + max size + 1",
      stats.isSmallerThan(combinedSize + 1));

  org.apache.parquet.format.Statistics serialized = helper.toParquetStatistics(stats);

  assertFalse("Min should not be set", serialized.isSetMin());
  assertFalse("Max should not be set", serialized.isSetMax());
  if (helper == StatsHelper.V2) {
    Assert.assertArrayEquals("Min_value should match", smallBound, serialized.getMin_value());
    Assert.assertArrayEquals("Max_value should match", largeBound, serialized.getMax_value());
  }
  Assert.assertEquals("Num nulls should match", 3004, serialized.getNull_count());

  // Convert to empty stats because the values are too large.
  stats.setMinMaxFromBytes(largeBound, largeBound);
  serialized = helper.toParquetStatistics(stats);

  Assert.assertFalse("Min should not be set", serialized.isSetMin());
  Assert.assertFalse("Max should not be set", serialized.isSetMax());
  Assert.assertFalse("Min_value should not be set", serialized.isSetMin_value());
  Assert.assertFalse("Max_value should not be set", serialized.isSetMax_value());
  Assert.assertFalse("Num nulls should not be set", serialized.isSetNull_count());

  PrimitiveType optionalBinary =
      new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, "");
  Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal(
      Version.FULL_VERSION,
      serialized,
      optionalBinary,
      ParquetMetadataConverter.SortOrder.SIGNED);
  Assert.assertTrue(roundTripStats.isEmpty());
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testStillUseStatsWithSignedSortOrderIfSingleValue.
/**
 * Verifies that min/max statistics survive conversion when min and max are the same
 * value, using a converter built with its default configuration.
 *
 * <p>Statistics are accumulated over the string "A" twice, interleaved with three
 * nulls, serialized through {@code helper}, and converted back. The result must not
 * be empty and its min bytes must equal its max bytes.
 *
 * @param helper strategy that produces the format-level statistics under test
 */
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();

  // A single distinct value, so min == max.
  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();

  PrimitiveType binaryType = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8)
      .named("b");

  // Serialize through the supplied helper rather than calling
  // ParquetMetadataConverter.toParquetStatistics directly; otherwise the helper
  // parameter is ignored and every helper variant exercises the same code path.
  Statistics convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION,
      helper.toParquetStatistics(stats),
      binaryType);

  Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
  Assert.assertArrayEquals(
      "min == max: " + convertedStats,
      convertedStats.getMaxBytes(),
      convertedStats.getMinBytes());
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testIgnoreStatsWithSignedSortOrder.
/**
 * Verifies that, with a converter built with its default configuration, min/max
 * statistics serialized through {@code StatsHelper.V1} for a UTF8 binary column are
 * dropped on conversion while the null count is preserved.
 */
@Test
public void testIgnoreStatsWithSignedSortOrder() {
  ParquetMetadataConverter defaultConverter = new ParquetMetadataConverter();

  // Two distinct values ("A" and "z") surrounded by three nulls.
  BinaryStatistics accumulated = new BinaryStatistics();
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("A"));
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("z"));
  accumulated.incrementNumNulls();

  PrimitiveType utf8Type = Types.required(PrimitiveTypeName.BINARY)
      .as(OriginalType.UTF8)
      .named("b");
  org.apache.parquet.format.Statistics serialized =
      StatsHelper.V1.toParquetStatistics(accumulated);
  Statistics convertedStats =
      defaultConverter.fromParquetStatistics(Version.FULL_VERSION, serialized, utf8Type);

  Assert.assertFalse(
      "Stats should not include min/max: " + convertedStats,
      convertedStats.hasNonNullValue());
  Assert.assertTrue(
      "Stats should have null count: " + convertedStats,
      convertedStats.isNumNullsSet());
  Assert.assertEquals(
      "Stats should have 3 nulls: " + convertedStats,
      3L,
      convertedStats.getNumNulls());
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class ParquetMetadataConverter method toParquetStatistics.
/**
 * Converts column statistics into their format-level {@link Statistics} representation.
 *
 * <p>A freshly constructed {@code Statistics} with nothing set is returned when
 * {@code stats} is empty or {@code withinLimit(stats, truncateLength)} rejects it.
 * Otherwise the null count is always copied, and the min/max bounds are copied only
 * when {@code stats} has a non-null value.
 *
 * @param stats          column statistics to convert
 * @param truncateLength length handed to the truncation helpers for binary statistics;
 *                       {@link Integer#MAX_VALUE} skips truncation
 * @return the format statistics, possibly with no field set
 */
public static Statistics toParquetStatistics(org.apache.parquet.column.statistics.Statistics stats, int truncateLength) {
  final Statistics result = new Statistics();

  // Nothing is written for empty stats or for stats that withinLimit rejects.
  if (stats.isEmpty() || !withinLimit(stats, truncateLength)) {
    return result;
  }

  result.setNull_count(stats.getNumNulls());
  if (!stats.hasNonNullValue()) {
    return result;
  }

  final byte[] lowerBound;
  final byte[] upperBound;
  final boolean shouldTruncate =
      stats instanceof BinaryStatistics && truncateLength != Integer.MAX_VALUE;
  if (shouldTruncate) {
    // Only binary statistics are truncated, and only when a real length was requested.
    final BinaryTruncator truncator = BinaryTruncator.getTruncator(stats.type());
    lowerBound = tuncateMin(truncator, truncateLength, stats.getMinBytes());
    upperBound = tuncateMax(truncator, truncateLength, stats.getMaxBytes());
  } else {
    lowerBound = stats.getMinBytes();
    upperBound = stats.getMaxBytes();
  }

  // Byte-equal bounds are valid under any ordering, so they always qualify below.
  final boolean boundsAreEqual = Arrays.equals(lowerBound, upperBound);

  // min/max are filled only when the type's sort order is SIGNED (or bounds are equal).
  if (sortOrder(stats.type()) == SortOrder.SIGNED || boundsAreEqual) {
    result.setMin(lowerBound);
    result.setMax(upperBound);
  }

  // min_value/max_value are filled when min-max stats are supported for the type
  // (or bounds are equal).
  if (isMinMaxStatsSupported(stats.type()) || boundsAreEqual) {
    result.setMin_value(lowerBound);
    result.setMax_value(upperBound);
  }
  return result;
}
Aggregations