use of org.apache.parquet.column.statistics.Statistics in project drill by apache.
the class ParquetFooterStatCollector method collectColStat.
/**
 * Collects column statistics for the requested fields from the Parquet footer,
 * using the row group at {@code rowGroupIndex}.
 *
 * <p>A field receives footer-based statistics only when it is found in all three
 * footer views consulted here: the schema's column descriptors, the schema
 * elements of the converted file metadata, and the row group's column chunks.
 * Statistics of DATE columns are passed through {@code convertDateStatIfNecessary}
 * together with the detected date-corruption status. A field that is not resolved
 * from the footer, but whose root segment names an entry of
 * {@code implicitColValues}, receives a required VARCHAR statistic with zero nulls
 * whose min and max are both the implicit value. Any other field is left out of
 * the result.
 *
 * @param fields columns to collect statistics for
 * @return map from column path to its statistics; fields that could not be
 *         resolved have no entry
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
  // map from column name to ColumnDescriptor
  final Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
  // map from column name to ColumnChunkMetaData
  final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
  // map from column name to SchemaElement
  final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
  // map from column name to column statistics.
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  // Index the three footer views, keeping only the requested fields.
  for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
    if (fields.contains(schemaPath)) {
      columnDescMap.put(schemaPath, column);
    }
  }
  for (final SchemaElement se : fileMetaData.getSchema()) {
    // Schema elements are keyed by their own name only (simple path).
    final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
    if (fields.contains(schemaPath)) {
      schemaElementMap.put(schemaPath, se);
    }
  }
  for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
    final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
    if (fields.contains(schemaPath)) {
      columnChkMetaMap.put(schemaPath, colMetaData);
    }
  }

  for (final SchemaPath path : fields) {
    if (columnDescMap.containsKey(path) && schemaElementMap.containsKey(path) && columnChkMetaMap.containsKey(path)) {
      ColumnDescriptor columnDesc = columnDescMap.get(path);
      SchemaElement se = schemaElementMap.get(path);
      ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
      TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
      Statistics stat = metaData.getStatistics();
      if (type.getMinorType() == TypeProtos.MinorType.DATE) {
        stat = convertDateStatIfNecessary(metaData.getStatistics(), containsCorruptDates);
      }
      statMap.put(path, new ColumnStatistics(stat, type));
    } else {
      final String columnName = path.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        // An implicit column holds one constant value, so min == max and there are no nulls.
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        // Encode explicitly as UTF-8 so the bytes do not depend on the JVM's default charset.
        byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
        stat.setMinMaxFromBytes(val, val);
        statMap.put(path, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
use of org.apache.parquet.column.statistics.Statistics in project drill by apache.
the class ParquetMetaStatCollector method collectColStat.
/**
 * Collects column statistics for the requested fields from the entries of
 * {@code columnMetadataList}.
 *
 * <p>For a field with a matching column metadata entry, the statistics are built
 * by {@code getStat} from the entry's min value, max value and null count, plus
 * the primitive type, original type and repetition level that
 * {@code parquetTableMetadata} reports for the column name. A field without such
 * an entry, but whose root segment names an entry of {@code implicitColValues},
 * receives a required VARCHAR statistic with zero nulls whose min and max are
 * both the implicit value. Any other field is left out of the result.
 *
 * @param fields columns to collect statistics for
 * @return map from column path to its statistics; fields that could not be
 *         resolved have no entry
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
  Stopwatch timer = Stopwatch.createStarted();
  // map from column to ColumnMetadata
  final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
  // map from column name to column statistics.
  final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
  for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
    SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
    columnMetadataMap.put(schemaPath, columnMetadata);
  }
  for (final SchemaPath schemaPath : fields) {
    final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
    if (columnMetadata != null) {
      final Object min = columnMetadata.getMinValue();
      final Object max = columnMetadata.getMaxValue();
      final Long numNull = columnMetadata.getNulls();
      // Type information is looked up on the table-level metadata by column name.
      final PrimitiveType.PrimitiveTypeName primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
      final OriginalType originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
      final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
      statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
    } else {
      final String columnName = schemaPath.getRootSegment().getPath();
      if (implicitColValues.containsKey(columnName)) {
        // An implicit column holds one constant value, so min == max and there are no nulls.
        TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
        Statistics stat = new BinaryStatistics();
        stat.setNumNulls(0);
        // Encode explicitly as UTF-8 so the bytes do not depend on the JVM's default charset.
        byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
        stat.setMinMaxFromBytes(val, val);
        statMap.put(schemaPath, new ColumnStatistics(stat, type));
      }
    }
  }
  if (logger.isDebugEnabled()) {
    logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
  }
  return statMap;
}
use of org.apache.parquet.column.statistics.Statistics in project drill by apache.
the class ParquetMetaStatCollector method getStat.
/**
 * Builds the column statistics for one column from its metadata values.
 *
 * <p>The Drill type comes from {@code ParquetGroupScan.getType}; when
 * {@code repetitionLevel} is positive the type is rebuilt from the minor type
 * alone with REPEATED mode. Min and max are applied only when both are present
 * and the minor type is one of INT, TIME, BIGINT, TIMESTAMP, FLOAT4, FLOAT8 or
 * DATE. For DATE, both bounds go through {@code convertToDrillDateValue} and are
 * returned in a separate {@link LongStatistics} carrying the same null count.
 *
 * @param min min value, may be null
 * @param max max value, may be null
 * @param numNull null count, may be null (then no null count is set)
 * @param primitiveType parquet primitive type selecting the statistics class
 * @param originalType parquet original type used to derive the Drill type
 * @param repetitionLevel repetition level of the column, may be null
 * @return column statistics paired with the derived Drill type
 */
private ColumnStatistics getStat(Object min, Object max, Long numNull, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, Integer repetitionLevel) {
  final Statistics rawStat = Statistics.getStatsBasedOnType(primitiveType);
  final TypeProtos.MajorType baseType = ParquetGroupScan.getType(primitiveType, originalType);

  // A positive repetition level turns the column into a REPEATED one.
  final boolean repeated = repetitionLevel != null && repetitionLevel > 0;
  final TypeProtos.MajorType type = repeated
      ? TypeProtos.MajorType.newBuilder().setMinorType(baseType.getMinorType()).setMode(TypeProtos.DataMode.REPEATED).build()
      : baseType;

  if (numNull != null) {
    rawStat.setNumNulls(numNull.longValue());
  }

  // Without both bounds there is nothing more to fill in.
  if (min == null || max == null) {
    return new ColumnStatistics(rawStat, type);
  }

  Statistics result = rawStat;
  switch (type.getMinorType()) {
    case INT:
    case TIME: {
      final IntStatistics intStat = (IntStatistics) rawStat;
      intStat.setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString()));
      break;
    }
    case BIGINT:
    case TIMESTAMP: {
      final LongStatistics longStat = (LongStatistics) rawStat;
      longStat.setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString()));
      break;
    }
    case FLOAT4: {
      final FloatStatistics floatStat = (FloatStatistics) rawStat;
      floatStat.setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString()));
      break;
    }
    case FLOAT8: {
      final DoubleStatistics doubleStat = (DoubleStatistics) rawStat;
      doubleStat.setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString()));
      break;
    }
    case DATE: {
      // Dates are reported through a fresh long-based statistic instead of the raw one.
      final LongStatistics dateStat = new LongStatistics();
      dateStat.setNumNulls(rawStat.getNumNulls());
      final long lowerDate = convertToDrillDateValue(Integer.parseInt(min.toString()));
      final long upperDate = convertToDrillDateValue(Integer.parseInt(max.toString()));
      dateStat.setMinMax(lowerDate, upperDate);
      result = dateStat;
      break;
    }
    default:
      // Other types keep the statistic without min/max.
      break;
  }
  return new ColumnStatistics(result, type);
}
use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.
the class RangeExprEvaluator method visitFunctionHolderExpression.
/**
 * Evaluates the statistics of a function call expression.
 *
 * <p>Only cast functions held by a {@link DrillSimpleFuncHolder} are handled:
 * the first argument is evaluated with this visitor and, when it yields
 * non-empty statistics, the result of {@code evalCastFunc} is returned.
 *
 * @param holderExpr function call expression to evaluate
 * @param value unused
 * @return statistics computed by {@code evalCastFunc}, or {@code null} when the
 *         holder is not a {@link DrillSimpleFuncHolder}, the function is not a
 *         cast, or the argument has no usable statistics
 */
@Override
public Statistics visitFunctionHolderExpression(FunctionHolderExpression holderExpr, Void value) throws RuntimeException {
  final FuncHolder holder = holderExpr.getHolder();
  // Only Drill function is allowed.
  if (!(holder instanceof DrillSimpleFuncHolder)) {
    return null;
  }

  final String registeredName = ((DrillSimpleFuncHolder) holder).getRegisteredNames()[0];
  if (!CastFunctions.isCastFunction(registeredName)) {
    return null;
  }

  final Statistics argStat = holderExpr.args.get(0).accept(this, null);
  if (argStat == null || argStat.isEmpty()) {
    return null;
  }
  return evalCastFunc(holderExpr, argStat);
}
use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.
the class ParquetMetaStatCollector method getStat.
/**
 * Creates the column statistics for one column from its metadata values.
 *
 * <p>The Drill type comes from {@code ParquetGroupScan.getType}. Min and max are
 * applied only when both are present and the minor type is one of INT, TIME,
 * BIGINT, TIMESTAMP, FLOAT4, FLOAT8, DATE or BIT. For DATE, both bounds go
 * through {@code convertToDrillDateValue} and are returned in a separate
 * {@link LongStatistics} carrying the same null count.
 *
 * @param min min value for statistics, may be null
 * @param max max value for statistics, may be null
 * @param numNull num_nulls for statistics, may be null (then no null count is set)
 * @param primitiveType type that determines statistics class
 * @param originalType type used together with primitiveType to derive the Drill type
 * @param scale scale value (used for DECIMAL type)
 * @param precision precision value (used for DECIMAL type)
 * @return column statistics
 */
private ColumnStatistics getStat(Object min, Object max, Long numNull, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, int scale, int precision) {
  final Statistics parquetStat = Statistics.getStatsBasedOnType(primitiveType);
  final TypeProtos.MajorType majorType = ParquetGroupScan.getType(primitiveType, originalType, scale, precision);

  if (numNull != null) {
    parquetStat.setNumNulls(numNull);
  }

  // Without both bounds the statistic is returned with only the null count set.
  final boolean boundsKnown = min != null && max != null;
  if (!boundsKnown) {
    return new ColumnStatistics(parquetStat, majorType);
  }

  Statistics result = parquetStat;
  switch (majorType.getMinorType()) {
    case INT:
    case TIME: {
      final IntStatistics intStat = (IntStatistics) parquetStat;
      intStat.setMinMax(Integer.parseInt(min.toString()), Integer.parseInt(max.toString()));
      break;
    }
    case BIGINT:
    case TIMESTAMP: {
      final LongStatistics longStat = (LongStatistics) parquetStat;
      longStat.setMinMax(Long.parseLong(min.toString()), Long.parseLong(max.toString()));
      break;
    }
    case FLOAT4: {
      final FloatStatistics floatStat = (FloatStatistics) parquetStat;
      floatStat.setMinMax(Float.parseFloat(min.toString()), Float.parseFloat(max.toString()));
      break;
    }
    case FLOAT8: {
      final DoubleStatistics doubleStat = (DoubleStatistics) parquetStat;
      doubleStat.setMinMax(Double.parseDouble(min.toString()), Double.parseDouble(max.toString()));
      break;
    }
    case DATE: {
      // Dates are reported through a fresh long-based statistic instead of the raw one.
      final LongStatistics dateStat = new LongStatistics();
      dateStat.setNumNulls(parquetStat.getNumNulls());
      final long lowerDate = convertToDrillDateValue(Integer.parseInt(min.toString()));
      final long upperDate = convertToDrillDateValue(Integer.parseInt(max.toString()));
      dateStat.setMinMax(lowerDate, upperDate);
      result = dateStat;
      break;
    }
    case BIT: {
      final BooleanStatistics boolStat = (BooleanStatistics) parquetStat;
      boolStat.setMinMax(Boolean.parseBoolean(min.toString()), Boolean.parseBoolean(max.toString()));
      break;
    }
    default:
      // Other types keep the statistic without min/max.
      break;
  }
  return new ColumnStatistics(result, majorType);
}
Aggregations