Search in sources :

Example 1 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by apache.

the class ParquetFooterStatCollector method collectColStat.

/**
 * Collects statistics for the requested columns from the Parquet footer.
 *
 * <p>A requested column gets footer-based statistics only when it is found in all three footer
 * views consulted here: the column descriptors of the file schema, the schema elements of the
 * converted file metadata, and the column chunks of the row group at {@code rowGroupIndex}.
 * For columns whose Drill type is DATE the statistics are passed through
 * {@code convertDateStatIfNecessary}.
 *
 * <p>Any other requested column whose root segment is a key of {@code implicitColValues} gets a
 * synthetic required-VARCHAR statistic with zero nulls whose min and max are both that value.
 * Requested columns that match neither case are simply absent from the returned map.
 *
 * @param fields columns to collect statistics for
 * @return map from column path to its statistics, containing only the columns that were resolved
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    ParquetReaderUtility.DateCorruptionStatus containsCorruptDates = ParquetReaderUtility.detectCorruptDates(footer, new ArrayList<>(fields), autoCorrectCorruptDates);
    // map from column name to ColumnDescriptor
    final Map<SchemaPath, ColumnDescriptor> columnDescMap = new HashMap<>();
    // map from column name to ColumnChunkMetaData
    final Map<SchemaPath, ColumnChunkMetaData> columnChkMetaMap = new HashMap<>();
    // map from column name to SchemaElement
    final Map<SchemaPath, SchemaElement> schemaElementMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    final org.apache.parquet.format.FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
    for (final ColumnDescriptor column : footer.getFileMetaData().getSchema().getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(column.getPath());
        if (fields.contains(schemaPath)) {
            columnDescMap.put(schemaPath, column);
        }
    }
    // Schema elements are keyed by their simple name only, unlike the compound paths used for
    // the descriptor and column-chunk maps.
    for (final SchemaElement se : fileMetaData.getSchema()) {
        final SchemaPath schemaPath = SchemaPath.getSimplePath(se.getName());
        if (fields.contains(schemaPath)) {
            schemaElementMap.put(schemaPath, se);
        }
    }
    for (final ColumnChunkMetaData colMetaData : footer.getBlocks().get(rowGroupIndex).getColumns()) {
        final SchemaPath schemaPath = SchemaPath.getCompoundPath(colMetaData.getPath().toArray());
        if (fields.contains(schemaPath)) {
            columnChkMetaMap.put(schemaPath, colMetaData);
        }
    }
    for (final SchemaPath path : fields) {
        // Only non-null values are ever stored in these maps, so a null lookup means "absent".
        final ColumnDescriptor columnDesc = columnDescMap.get(path);
        final SchemaElement se = schemaElementMap.get(path);
        final ColumnChunkMetaData metaData = columnChkMetaMap.get(path);
        if (columnDesc != null && se != null && metaData != null) {
            TypeProtos.MajorType type = ParquetToDrillTypeConverter.toMajorType(columnDesc.getType(), se.getType_length(), getDataMode(columnDesc), se, options);
            Statistics stat = metaData.getStatistics();
            if (type.getMinorType() == TypeProtos.MinorType.DATE) {
                stat = convertDateStatIfNecessary(stat, containsCorruptDates);
            }
            statMap.put(path, new ColumnStatistics(stat, type));
        } else {
            final String columnName = path.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                // An implicit column holds one constant value for the whole row group, so
                // min == max and there are no nulls.
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode with an explicit charset: the no-arg getBytes() uses the platform
                // default, which makes the min/max bytes differ between JVMs.
                // NOTE(review): UTF-8 assumed to match VARCHAR encoding - confirm.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(path, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) Stopwatch(com.google.common.base.Stopwatch) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) SchemaPath(org.apache.drill.common.expression.SchemaPath) SchemaElement(org.apache.parquet.format.SchemaElement) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) ParquetReaderUtility(org.apache.drill.exec.store.parquet.ParquetReaderUtility) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter)

Example 2 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by apache.

the class ParquetMetaStatCollector method collectColStat.

/**
 * Collects statistics for the requested columns from the cached column metadata.
 *
 * <p>Every entry of {@code columnMetadataList} is indexed by the compound path built from its
 * name. A requested column found in that index gets statistics built by {@code getStat} from its
 * min value, max value and null count, together with the primitive type, original type and
 * repetition level looked up in {@code parquetTableMetadata}.
 *
 * <p>A requested column not found in the index, but whose root segment is a key of
 * {@code implicitColValues}, gets a synthetic required-VARCHAR statistic with zero nulls whose
 * min and max are both that value. Columns matching neither case are absent from the result.
 *
 * @param fields columns to collect statistics for
 * @return map from column path to its statistics, containing only the columns that were resolved
 */
@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath schemaPath : fields) {
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(schemaPath);
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            // Type information is only needed on this branch, so it is declared here.
            final PrimitiveType.PrimitiveTypeName primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            final OriginalType originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            final Integer repetitionLevel = this.parquetTableMetadata.getRepetitionLevel(columnMetadata.getName());
            statMap.put(schemaPath, getStat(min, max, numNull, primitiveType, originalType, repetitionLevel));
        } else {
            final String columnName = schemaPath.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                // An implicit column holds one constant value, so min == max and there are
                // no nulls.
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                // Encode with an explicit charset: the no-arg getBytes() uses the platform
                // default, which makes the min/max bytes differ between JVMs.
                // NOTE(review): UTF-8 assumed to match VARCHAR encoding - confirm.
                byte[] val = implicitColValues.get(columnName).getBytes(java.nio.charset.StandardCharsets.UTF_8);
                stat.setMinMaxFromBytes(val, val);
                statMap.put(schemaPath, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)

Example 3 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by apache.

the class ParquetMetaStatCollector method getStat.

/**
 * Builds column statistics from metadata values.
 *
 * <p>The statistics object is chosen from {@code primitiveType}; the Drill type is derived from
 * {@code primitiveType} and {@code originalType} and switched to REPEATED mode when
 * {@code repetitionLevel} is greater than zero. Min/max are filled in only when both are
 * non-null and the Drill minor type is one of INT, TIME, BIGINT, TIMESTAMP, FLOAT4, FLOAT8 or
 * DATE. For DATE the values are converted with {@code convertToDrillDateValue} and reported
 * through a separate long-based statistics object.
 *
 * @param min             min value, parsed from its string form; may be null
 * @param max             max value, parsed from its string form; may be null
 * @param numNull         null count; may be null, in which case it is not set
 * @param primitiveType   Parquet primitive type
 * @param originalType    Parquet original type
 * @param repetitionLevel repetition level; may be null
 * @return column statistics paired with the derived Drill type
 */
private ColumnStatistics getStat(Object min, Object max, Long numNull, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, Integer repetitionLevel) {
    final Statistics rawStat = Statistics.getStatsBasedOnType(primitiveType);
    TypeProtos.MajorType majorType = ParquetGroupScan.getType(primitiveType, originalType);

    // A positive repetition level means the column is repeated.
    final boolean repeated = repetitionLevel != null && repetitionLevel > 0;
    if (repeated) {
        majorType = TypeProtos.MajorType.newBuilder()
            .setMinorType(majorType.getMinorType())
            .setMode(TypeProtos.DataMode.REPEATED)
            .build();
    }

    if (numNull != null) {
        rawStat.setNumNulls(numNull);
    }

    // Without both bounds there is nothing more to fill in.
    if (min == null || max == null) {
        return new ColumnStatistics(rawStat, majorType);
    }

    switch (majorType.getMinorType()) {
        case INT:
        case TIME: {
            final IntStatistics intStat = (IntStatistics) rawStat;
            final int lo = Integer.parseInt(min.toString());
            final int hi = Integer.parseInt(max.toString());
            intStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case BIGINT:
        case TIMESTAMP: {
            final LongStatistics longStat = (LongStatistics) rawStat;
            final long lo = Long.parseLong(min.toString());
            final long hi = Long.parseLong(max.toString());
            longStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case FLOAT4: {
            final FloatStatistics floatStat = (FloatStatistics) rawStat;
            final float lo = Float.parseFloat(min.toString());
            final float hi = Float.parseFloat(max.toString());
            floatStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case FLOAT8: {
            final DoubleStatistics doubleStat = (DoubleStatistics) rawStat;
            final double lo = Double.parseDouble(min.toString());
            final double hi = Double.parseDouble(max.toString());
            doubleStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case DATE: {
            // Dates are reported through a fresh long-based statistic carrying the same
            // null count, with bounds converted by convertToDrillDateValue.
            final LongStatistics dateStat = new LongStatistics();
            dateStat.setNumNulls(rawStat.getNumNulls());
            final long lo = convertToDrillDateValue(Integer.parseInt(min.toString()));
            final long hi = convertToDrillDateValue(Integer.parseInt(max.toString()));
            dateStat.setMinMax(lo, hi);
            return new ColumnStatistics(dateStat, majorType);
        }
        default:
            // Other types keep only the null count.
            return new ColumnStatistics(rawStat, majorType);
    }
}
Also used : LongStatistics(org.apache.parquet.column.statistics.LongStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos)

Example 4 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.

the class RangeExprEvaluator method visitFunctionHolderExpression.

/**
 * Evaluates the value range of a function call expression.
 *
 * <p>A non-null result is produced only when the holder is a {@code DrillSimpleFuncHolder}, its
 * first registered name is a cast function, and visiting the first argument yields a non-null,
 * non-empty statistic; in that case the result comes from {@code evalCastFunc}.
 *
 * @param holderExpr function call expression to evaluate
 * @param value      unused
 * @return statistics for the cast result, or null when the expression cannot be evaluated
 */
@Override
public Statistics visitFunctionHolderExpression(FunctionHolderExpression holderExpr, Void value) throws RuntimeException {
    Statistics result = null;
    final FuncHolder holder = holderExpr.getHolder();
    // Only Drill simple functions are handled; anything else yields null.
    if (holder instanceof DrillSimpleFuncHolder) {
        final DrillSimpleFuncHolder simpleHolder = (DrillSimpleFuncHolder) holder;
        final String registeredName = simpleHolder.getRegisteredNames()[0];
        if (CastFunctions.isCastFunction(registeredName)) {
            final Statistics argStat = holderExpr.args.get(0).accept(this, null);
            final boolean usable = argStat != null && !argStat.isEmpty();
            if (usable) {
                result = evalCastFunc(holderExpr, argStat);
            }
        }
    }
    return result;
}
Also used : DrillSimpleFuncHolder(org.apache.drill.exec.expr.fn.DrillSimpleFuncHolder) FuncHolder(org.apache.drill.common.expression.fn.FuncHolder) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) ColumnStatistics(org.apache.drill.exec.store.parquet.stat.ColumnStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) DrillSimpleFuncHolder(org.apache.drill.exec.expr.fn.DrillSimpleFuncHolder)

Example 5 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.

the class ParquetMetaStatCollector method getStat.

/**
 * Creates column statistics from the supplied metadata values.
 *
 * <p>The statistics object is chosen from {@code primitiveType}; the Drill type is derived from
 * {@code primitiveType}, {@code originalType}, {@code scale} and {@code precision}. Min/max are
 * filled in only when both are non-null and the Drill minor type is one of INT, TIME, BIGINT,
 * TIMESTAMP, FLOAT4, FLOAT8, DATE or BIT. For DATE the values are converted with
 * {@code convertToDrillDateValue} and reported through a separate long-based statistics object.
 *
 * @param min             min value for statistics, parsed from its string form; may be null
 * @param max             max value for statistics, parsed from its string form; may be null
 * @param numNull         num_nulls for statistics; not set when null
 * @param primitiveType   Parquet primitive type that selects the statistics class
 * @param originalType    Parquet original type used to derive the Drill type
 * @param scale           scale value (used for DECIMAL type)
 * @param precision       precision value (used for DECIMAL type)
 * @return column statistics paired with the derived Drill type
 */
private ColumnStatistics getStat(Object min, Object max, Long numNull, PrimitiveType.PrimitiveTypeName primitiveType, OriginalType originalType, int scale, int precision) {
    final Statistics rawStat = Statistics.getStatsBasedOnType(primitiveType);
    final TypeProtos.MajorType majorType = ParquetGroupScan.getType(primitiveType, originalType, scale, precision);

    if (numNull != null) {
        rawStat.setNumNulls(numNull.longValue());
    }

    // Without both bounds there is nothing more to fill in.
    if (min == null || max == null) {
        return new ColumnStatistics(rawStat, majorType);
    }

    switch (majorType.getMinorType()) {
        case INT:
        case TIME: {
            final IntStatistics intStat = (IntStatistics) rawStat;
            final int lo = Integer.parseInt(min.toString());
            final int hi = Integer.parseInt(max.toString());
            intStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case BIGINT:
        case TIMESTAMP: {
            final LongStatistics longStat = (LongStatistics) rawStat;
            final long lo = Long.parseLong(min.toString());
            final long hi = Long.parseLong(max.toString());
            longStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case FLOAT4: {
            final FloatStatistics floatStat = (FloatStatistics) rawStat;
            final float lo = Float.parseFloat(min.toString());
            final float hi = Float.parseFloat(max.toString());
            floatStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case FLOAT8: {
            final DoubleStatistics doubleStat = (DoubleStatistics) rawStat;
            final double lo = Double.parseDouble(min.toString());
            final double hi = Double.parseDouble(max.toString());
            doubleStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        case DATE: {
            // Dates are reported through a fresh long-based statistic carrying the same
            // null count, with bounds converted by convertToDrillDateValue.
            final LongStatistics dateStat = new LongStatistics();
            dateStat.setNumNulls(rawStat.getNumNulls());
            final long lo = convertToDrillDateValue(Integer.parseInt(min.toString()));
            final long hi = convertToDrillDateValue(Integer.parseInt(max.toString()));
            dateStat.setMinMax(lo, hi);
            return new ColumnStatistics(dateStat, majorType);
        }
        case BIT: {
            final BooleanStatistics boolStat = (BooleanStatistics) rawStat;
            final boolean lo = Boolean.parseBoolean(min.toString());
            final boolean hi = Boolean.parseBoolean(max.toString());
            boolStat.setMinMax(lo, hi);
            return new ColumnStatistics(rawStat, majorType);
        }
        default:
            // Other types keep only the null count.
            return new ColumnStatistics(rawStat, majorType);
    }
}
Also used : LongStatistics(org.apache.parquet.column.statistics.LongStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos)

Aggregations

Statistics (org.apache.parquet.column.statistics.Statistics)20 IntStatistics (org.apache.parquet.column.statistics.IntStatistics)14 LongStatistics (org.apache.parquet.column.statistics.LongStatistics)14 DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics)12 FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics)12 BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)11 PrimitiveType (org.apache.parquet.schema.PrimitiveType)11 BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics)9 TypeProtos (org.apache.drill.common.types.TypeProtos)6 HashMap (java.util.HashMap)5 Stopwatch (com.google.common.base.Stopwatch)4 SchemaPath (org.apache.drill.common.expression.SchemaPath)4 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)4 Slice (io.airlift.slice.Slice)2 BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData)2 ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData)2 Binary (org.apache.parquet.io.api.Binary)2 Test (org.junit.Test)2 Domain (com.facebook.presto.common.predicate.Domain)1 Range (com.facebook.presto.common.predicate.Range)1