use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testMissingValuesFromStats.
/**
 * Converts format-level statistics for a required INT32 column in three states (nothing set,
 * only min/max set, only the null count set) and checks what the converted statistics report
 * in each state.
 */
@Test
public void testMissingValuesFromStats() {
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  PrimitiveType int32Type = Types.required(PrimitiveTypeName.INT32).named("test_int32");
  org.apache.parquet.format.Statistics thriftStats = new org.apache.parquet.format.Statistics();

  // Scenario 1: no field is set on the format statistics.
  Statistics<?> nothingSet =
      metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertFalse(nothingSet.isNumNullsSet());
  assertFalse(nothingSet.hasNonNullValue());
  assertTrue(nothingSet.isEmpty());
  assertEquals(-1, nothingSet.getNumNulls());

  // Scenario 2: only min and max are set; the null count stays unset.
  thriftStats.clear();
  thriftStats.setMin(BytesUtils.intToBytes(-100));
  thriftStats.setMax(BytesUtils.intToBytes(100));
  Statistics<?> minMaxOnly =
      metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertFalse(minMaxOnly.isNumNullsSet());
  assertTrue(minMaxOnly.hasNonNullValue());
  assertFalse(minMaxOnly.isEmpty());
  assertEquals(-1, minMaxOnly.getNumNulls());
  assertEquals(-100, minMaxOnly.genericGetMin());
  assertEquals(100, minMaxOnly.genericGetMax());

  // Scenario 3: only the null count is set; min and max stay unset.
  thriftStats.clear();
  thriftStats.setNull_count(2000);
  Statistics<?> nullCountOnly =
      metadataConverter.fromParquetStatistics(Version.FULL_VERSION, thriftStats, int32Type);
  assertTrue(nullCountOnly.isNumNullsSet());
  assertFalse(nullCountOnly.hasNonNullValue());
  assertFalse(nullCountOnly.isEmpty());
  assertEquals(2000, nullCountOnly.getNumNulls());
}
use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testStillUseStatsWithSignedSortOrderIfSingleValue.
/**
 * Round-trips binary statistics whose only non-null value is {@code "A"} (recorded twice,
 * interleaved with three nulls) through the given helper and the converter for a UTF8 binary
 * type, then checks that the converted statistics are not empty and that their min and max
 * bytes are equal.
 *
 * @param helper produces the format-level statistics that are fed back into the converter
 */
private void testStillUseStatsWithSignedSortOrderIfSingleValue(StatsHelper helper) {
  ParquetMetadataConverter converter = new ParquetMetadataConverter();

  BinaryStatistics stats = new BinaryStatistics();
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();
  stats.updateStats(Binary.fromString("A"));
  stats.incrementNumNulls();

  PrimitiveType binaryType =
      Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");

  // Go through the helper (as the sibling helper-parameterized tests do) so that the variant
  // passed in by the caller is the one actually exercised.
  Statistics<?> convertedStats = converter.fromParquetStatistics(
      Version.FULL_VERSION, helper.toParquetStatistics(stats), binaryType);

  Assert.assertFalse("Stats should not be empty: " + convertedStats, convertedStats.isEmpty());
  Assert.assertArrayEquals(
      "min == max: " + convertedStats, convertedStats.getMaxBytes(), convertedStats.getMinBytes());
}
use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testUseStatsWithSignedSortOrder.
/**
 * Round-trips binary statistics holding {@code "A"} and {@code "z"} plus three nulls through
 * the given helper and a converter configured with
 * {@code parquet.strings.signed-min-max.enabled = true}, then checks the null count and,
 * depending on the helper, whether min/max are present.
 *
 * @param helper produces the format-level statistics that are fed back into the converter
 */
private void testUseStatsWithSignedSortOrder(StatsHelper helper) {
  // Override the defaults so that stats accumulated using signed order are used.
  Configuration signedMinMaxConf = new Configuration();
  signedMinMaxConf.setBoolean("parquet.strings.signed-min-max.enabled", true);
  ParquetMetadataConverter signedConverter = new ParquetMetadataConverter(signedMinMaxConf);

  BinaryStatistics accumulated = new BinaryStatistics();
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("A"));
  accumulated.incrementNumNulls();
  accumulated.updateStats(Binary.fromString("z"));
  accumulated.incrementNumNulls();

  PrimitiveType utf8Type =
      Types.required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("b");
  Statistics convertedStats = signedConverter.fromParquetStatistics(
      Version.FULL_VERSION, helper.toParquetStatistics(accumulated), utf8Type);

  Assert.assertFalse("Stats should not be empty", convertedStats.isEmpty());
  Assert.assertTrue(convertedStats.isNumNullsSet());
  Assert.assertEquals("Should have 3 nulls", 3, convertedStats.getNumNulls());

  if (helper != StatsHelper.V1) {
    // Every helper other than V1 is expected to carry both bounds through the round trip.
    Assert.assertEquals(
        "Should have correct min (unsigned sort)",
        Binary.fromString("A"),
        convertedStats.genericGetMin());
    Assert.assertEquals(
        "Should have correct max (unsigned sort)",
        Binary.fromString("z"),
        convertedStats.genericGetMax());
  } else {
    assertFalse("Min-max should be null for V1 stats", convertedStats.hasNonNullValue());
  }
}
use of org.apache.parquet.column.statistics.Statistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testBinaryStats.
/**
 * Builds binary statistics from a 904-byte and a 2388-byte value, verifies the size check of
 * {@code isSmallerThan}, and checks which fields the helper sets on the format-level
 * statistics. Then replaces both bounds with the larger value and verifies that the
 * format-level statistics come out with nothing set and round-trip to empty statistics.
 *
 * @param helper produces the format-level statistics under test
 */
private void testBinaryStats(StatsHelper helper) {
  // Make fake stats and verify the size check.
  BinaryStatistics binaryStats = new BinaryStatistics();
  binaryStats.incrementNumNulls(3004);
  byte[] minBytes = new byte[904];
  byte[] maxBytes = new byte[2388];
  binaryStats.updateStats(Binary.fromConstantByteArray(minBytes));
  binaryStats.updateStats(Binary.fromConstantByteArray(maxBytes));

  long combinedLength = minBytes.length + maxBytes.length;
  Assert.assertFalse(
      "Should not be smaller than min + max size", binaryStats.isSmallerThan(combinedLength));
  Assert.assertTrue(
      "Should be smaller than min + max size + 1", binaryStats.isSmallerThan(combinedLength + 1));

  org.apache.parquet.format.Statistics firstConversion = helper.toParquetStatistics(binaryStats);
  assertFalse("Min should not be set", firstConversion.isSetMin());
  assertFalse("Max should not be set", firstConversion.isSetMax());
  if (helper == StatsHelper.V2) {
    Assert.assertArrayEquals("Min_value should match", minBytes, firstConversion.getMin_value());
    Assert.assertArrayEquals("Max_value should match", maxBytes, firstConversion.getMax_value());
  }
  Assert.assertEquals("Num nulls should match", 3004, firstConversion.getNull_count());

  // Convert to empty stats because the values are too large.
  binaryStats.setMinMaxFromBytes(maxBytes, maxBytes);
  org.apache.parquet.format.Statistics oversizedConversion =
      helper.toParquetStatistics(binaryStats);
  Assert.assertFalse("Min should not be set", oversizedConversion.isSetMin());
  Assert.assertFalse("Max should not be set", oversizedConversion.isSetMax());
  Assert.assertFalse("Min_value should not be set", oversizedConversion.isSetMin_value());
  Assert.assertFalse("Max_value should not be set", oversizedConversion.isSetMax_value());
  Assert.assertFalse("Num nulls should not be set", oversizedConversion.isSetNull_count());

  // Reading the unset format stats back must yield empty statistics.
  Statistics roundTripStats = ParquetMetadataConverter.fromParquetStatisticsInternal(
      Version.FULL_VERSION,
      oversizedConversion,
      new PrimitiveType(Repetition.OPTIONAL, PrimitiveTypeName.BINARY, ""),
      ParquetMetadataConverter.SortOrder.SIGNED);
  Assert.assertTrue(roundTripStats.isEmpty());
}
use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.
the class ParquetFooterStatCollector method collectColStat.
/**
 * Collects statistics for the requested columns from the Parquet footer, using the column
 * chunks of the row group at {@code rowGroupIndex}.
 *
 * <p>A requested path that is found in the footer schema, in the converted file metadata
 * schema and among the row group's column chunks gets the chunk's statistics (passed through
 * {@code convertDateStatIfNecessary} for DATE columns). Any other requested path whose root
 * segment is a key of {@code implicitColValues} gets synthetic VARCHAR statistics with zero
 * nulls and min == max == the implicit value. Paths matching neither case are omitted.
 *
 * @param fields the column paths to collect statistics for
 * @return a map from column path to its statistics; contains only the paths that were resolved
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();

  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates =
      ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);

  // map from column name to ColumnDescriptor
  final Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
  // map from column name to column statistics.
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();

  final org.apache.parquet.format.FileMetaData fileMetaData =
      new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }

  for (final SchemaElement se : fileMetaData.getSchema()) {
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }

  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }

  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);

      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(
          columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);

      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        stat = convertDateStatIfNecessary(stat, containsCorruptDates);
      }

      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        // Encode explicitly as UTF-8 rather than with the platform default charset, so the
        // min/max bytes do not depend on the JVM the collector happens to run on.
        byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }

  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }

  return statMap;
}
Aggregations