use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method testBinaryStatsWithTruncation.
// The values of minLen and maxLen shouldn't matter because the comparison is controlled by the truncated prefix
/**
 * Verifies that binary min/max statistics survive (possibly truncated) conversion to the
 * Thrift format. Min starts with "a" and max with "b", so the ordering is fixed by the
 * first byte regardless of {@code minLen}/{@code maxLen}.
 *
 * @param truncateLen truncation length handed to the {@link ParquetMetadataConverter}
 * @param minLen length of the generated min value
 * @param maxLen length of the generated max value
 */
private void testBinaryStatsWithTruncation(int truncateLen, int minLen, int maxLen) {
    // Encode and decode with an explicit charset; the original mixed platform-default
    // getBytes() with UTF-8 decoding, which breaks on non-UTF-8 default platforms.
    final java.nio.charset.Charset utf8 = java.nio.charset.StandardCharsets.UTF_8;
    BinaryStatistics stats = new BinaryStatistics();
    byte[] min = generateRandomString("a", minLen).getBytes(utf8);
    byte[] max = generateRandomString("b", maxLen).getBytes(utf8);
    stats.updateStats(Binary.fromConstantByteArray(min));
    stats.updateStats(Binary.fromConstantByteArray(max));
    ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter(truncateLen);
    org.apache.parquet.format.Statistics formatStats = metadataConverter.toParquetStatistics(stats);
    if (minLen + maxLen >= ParquetMetadataConverter.MAX_STATS_SIZE) {
        // Oversized stats are dropped entirely rather than truncated.
        assertNull(formatStats.getMin_value());
        assertNull(formatStats.getMax_value());
    } else {
        // A truncated min must compare <= the original; a truncated max must compare >=.
        String minString = new String(min, utf8);
        String minStatString = new String(formatStats.getMin_value(), utf8);
        assertTrue(minStatString.compareTo(minString) <= 0);
        String maxString = new String(max, utf8);
        String maxStatString = new String(formatStats.getMax_value(), utf8);
        assertTrue(maxStatString.compareTo(maxString) >= 0);
    }
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by axbaretto.
the class ParquetFooterStatCollector method collectColStat.
/**
 * Collects per-column statistics for the requested {@code fields} from the Parquet
 * footer of the row group identified by {@code rowGroupIndex}.
 *
 * A field contributes a statistics entry only if it is found in all three footer views
 * (column descriptor, schema element, and column-chunk metadata); otherwise, if the
 * field is a known implicit column, a synthetic required-VARCHAR statistic is built
 * from the implicit value. Fields matching neither are silently omitted from the result.
 *
 * @param fields the columns to collect statistics for
 * @return map from column path to its statistics (may contain fewer entries than fields)
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
Stopwatch timer = Stopwatch.createStarted();
// Detect date corruption once up front; used below to adjust DATE column stats.
ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
// map from column name to ColumnDescriptor
Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
// map from column name to ColumnChunkMetaData
final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
// map from column name to MajorType
final Map<SchemaPath, TypeProtos.MajorType> columnTypeMap = new HashMap<>();
// map from column name to SchemaElement
final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
// map from column name to column statistics.
final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
// Convert the footer to the Thrift representation to get at the SchemaElements.
final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
// Index the requested fields by each of the three footer views.
for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
if (fields.contains(schemaPath)) {
columnDescMap.put(schemaPath, column);
}
}
for (final SchemaElement se : fileMetaData.getSchema()) {
final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
if (fields.contains(schemaPath)) {
schemaElementMap.put(schemaPath, se);
}
}
// Only the chunks of the row group this collector is bound to.
for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
if (fields.contains(schemaPath)) {
columnChkMetaMap.put(schemaPath, colMetaData);
}
}
for (final SchemaPath path : fields) {
if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
ColumnDescriptor columnDesc = columnDescMap.get(path);
SchemaElement se = schemaElementMap.get(path);
ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
columnTypeMap.put(path, type);
Statistics stat = metaData.getStatistics();
// DATE columns may carry corrupt epoch-based values; rewrite stats if needed.
if (type.getMinorType() == TypeProtos.MinorType.DATE) {
stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
}
statMap.put(path, new ColumnStatistics(stat, type));
} else {
// Not a real column in the file: fall back to implicit columns (e.g. file metadata
// columns), modeled as a required VARCHAR whose min == max == the implicit value.
final String columnName = path.getRootSegment().getPath();
if (implicitColValues.containsKey(columnName)) {
TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
Statistics stat = new BinaryStatistics();
stat.setNumNulls(0);
// NOTE(review): getBytes() uses the platform default charset here — confirm UTF-8 is intended.
byte[] val = implicitColValues.get(columnName).getBytes();
stat.setMinMaxFromBytes(val, val);
statMap.put(path, new ColumnStatistics(stat, type));
}
}
}
if (logger.isDebugEnabled()) {
logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
}
return statMap;
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project drill by axbaretto.
the class ParquetMetaStatCollector method collectColStat.
/**
 * Collects per-column statistics for the requested {@code fields} from cached table
 * metadata ({@code columnMetadataList}) rather than from a Parquet footer.
 *
 * Fields present in the cached metadata get statistics built from the cached
 * min/max/null counts (with scale/precision when the metadata is v3); fields that are
 * known implicit columns get a synthetic required-VARCHAR statistic; all other fields
 * are omitted from the result.
 *
 * @param fields the columns to collect statistics for
 * @return map from column path to its statistics (may contain fewer entries than fields)
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
Stopwatch timer = Stopwatch.createStarted();
// map from column to ColumnMetadata
final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
// map from column name to column statistics.
final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
// Index the cached column metadata by column path for O(1) lookup below.
for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
columnMetadataMap.put(schemaPath, columnMetadata);
}
for (final SchemaPath field : fields) {
final PrimitiveType.PrimitiveTypeName primitiveType;
final OriginalType originalType;
// Look up with the un-indexed form so array accesses match the cached entry.
final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(field.getUnIndexed());
if (columnMetadata != null) {
final Object min = columnMetadata.getMinValue();
final Object max = columnMetadata.getMaxValue();
final Long numNull = columnMetadata.getNulls();
primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
int precision = 0;
int scale = 0;
// ColumnTypeMetadata_v3 stores information about scale and precision
if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3) {
Metadata.ColumnTypeMetadata_v3 columnTypeInfo = ((Metadata.ParquetTableMetadata_v3) parquetTableMetadata).getColumnTypeInfo(columnMetadata.getName());
scale = columnTypeInfo.scale;
precision = columnTypeInfo.precision;
}
statMap.put(field, getStat(min, max, numNull, primitiveType, originalType, scale, precision));
} else {
// Not in the cached metadata: fall back to implicit columns, modeled as a
// required VARCHAR whose min == max == the implicit value.
final String columnName = field.getRootSegment().getPath();
if (implicitColValues.containsKey(columnName)) {
TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
Statistics stat = new BinaryStatistics();
stat.setNumNulls(0);
// NOTE(review): getBytes() uses the platform default charset here — confirm UTF-8 is intended.
byte[] val = implicitColValues.get(columnName).getBytes();
stat.setMinMaxFromBytes(val, val);
statMap.put(field, new ColumnStatistics(stat, type));
}
}
}
if (logger.isDebugEnabled()) {
logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
}
return statMap;
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestColumnChunkMetaData method newMD.
/**
 * Builds a minimal column-chunk metadata fixture: a single BINARY column "foo",
 * GZIP codec, no encodings, empty binary statistics, with {@code big} as the
 * first value passed to {@link ColumnChunkMetaData#get} and all remaining
 * numeric arguments zero.
 *
 * @param big the (potentially large) offset value under test
 * @return the constructed metadata
 */
private ColumnChunkMetaData newMD(long big) {
    return ColumnChunkMetaData.get(
        ColumnPath.get("foo"),
        BINARY,
        CompressionCodecName.GZIP,
        new HashSet<Encoding>(),
        new BinaryStatistics(),
        big, 0, 0, 0, 0);
}
use of org.apache.parquet.column.statistics.BinaryStatistics in project parquet-mr by apache.
the class TestParquetMetadataConverter method createParquetMetaData.
/**
 * Builds a single-block {@link ParquetMetadata} fixture with one INT32 column "col"
 * whose encoding stats carry the given data encoding and, optionally, a dictionary
 * encoding.
 *
 * @param dicEncoding dictionary encoding to record, or {@code null} for none
 * @param dataEncoding data encoding to record
 * @return the constructed metadata
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) {
    MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }");
    org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData = new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
    List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
    BlockMetaData blockMetaData = new BlockMetaData();
    EncodingStats.Builder builder = new EncodingStats.Builder();
    if (dicEncoding != null) {
        // Fixed: the original called build() here and discarded the result;
        // the builder is built exactly once, below.
        builder.addDictEncoding(dicEncoding);
    }
    builder.addDataEncoding(dataEncoding);
    EncodingStats es = builder.build();
    Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
    PrimitiveTypeName t = PrimitiveTypeName.INT32;
    ColumnPath p = ColumnPath.get("col");
    CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
    BinaryStatistics s = new BinaryStatistics();
    ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
    blockMetaData.addColumn(md);
    blockMetaDataList.add(blockMetaData);
    return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
Aggregations