
Example 16 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project drill by axbaretto.

the class ParquetMetaStatCollector method collectColStat.

@Override
public Map<SchemaPath, ColumnStatistics> collectColStat(Set<SchemaPath> fields) {
    Stopwatch timer = Stopwatch.createStarted();
    // map from column to ColumnMetadata
    final Map<SchemaPath, Metadata.ColumnMetadata> columnMetadataMap = new HashMap<>();
    // map from column name to column statistics.
    final Map<SchemaPath, ColumnStatistics> statMap = new HashMap<>();
    for (final Metadata.ColumnMetadata columnMetadata : columnMetadataList) {
        SchemaPath schemaPath = SchemaPath.getCompoundPath(columnMetadata.getName());
        columnMetadataMap.put(schemaPath, columnMetadata);
    }
    for (final SchemaPath field : fields) {
        final PrimitiveType.PrimitiveTypeName primitiveType;
        final OriginalType originalType;
        final Metadata.ColumnMetadata columnMetadata = columnMetadataMap.get(field.getUnIndexed());
        if (columnMetadata != null) {
            final Object min = columnMetadata.getMinValue();
            final Object max = columnMetadata.getMaxValue();
            final Long numNull = columnMetadata.getNulls();
            primitiveType = this.parquetTableMetadata.getPrimitiveType(columnMetadata.getName());
            originalType = this.parquetTableMetadata.getOriginalType(columnMetadata.getName());
            int precision = 0;
            int scale = 0;
            // ColumnTypeMetadata_v3 stores information about scale and precision
            if (parquetTableMetadata instanceof Metadata.ParquetTableMetadata_v3) {
                Metadata.ColumnTypeMetadata_v3 columnTypeInfo = ((Metadata.ParquetTableMetadata_v3) parquetTableMetadata).getColumnTypeInfo(columnMetadata.getName());
                scale = columnTypeInfo.scale;
                precision = columnTypeInfo.precision;
            }
            statMap.put(field, getStat(min, max, numNull, primitiveType, originalType, scale, precision));
        } else {
            final String columnName = field.getRootSegment().getPath();
            if (implicitColValues.containsKey(columnName)) {
                TypeProtos.MajorType type = Types.required(TypeProtos.MinorType.VARCHAR);
                Statistics stat = new BinaryStatistics();
                stat.setNumNulls(0);
                byte[] val = implicitColValues.get(columnName).getBytes();
                stat.setMinMaxFromBytes(val, val);
                statMap.put(field, new ColumnStatistics(stat, type));
            }
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Took {} ms to collect column statistics for row group", timer.elapsed(TimeUnit.MILLISECONDS));
    }
    return statMap;
}
Also used : HashMap(java.util.HashMap) Stopwatch(com.google.common.base.Stopwatch) Metadata(org.apache.drill.exec.store.parquet.Metadata) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) TypeProtos(org.apache.drill.common.types.TypeProtos) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) OriginalType(org.apache.parquet.schema.OriginalType) SchemaPath(org.apache.drill.common.expression.SchemaPath) PrimitiveType(org.apache.parquet.schema.PrimitiveType)
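For comparison, here is a minimal sketch (not from the Drill source) of building the same kind of synthetic VARCHAR statistic with the builder-based Statistics API instead of the deprecated setMinMaxFromBytes call; the class name and the column name "dir0" are illustrative only.

import java.nio.charset.StandardCharsets;

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type.Repetition;

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;

public class ImplicitColumnStatsSketch {
    // Builds read-side Statistics for a constant string value, such as an implicit
    // partition column: min and max are the same value and there are no nulls.
    static Statistics<?> constantStringStats(String value) {
        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
        return Statistics.getBuilderForReading(
                // "dir0" is an illustrative column name
                new PrimitiveType(Repetition.REQUIRED, BINARY, "dir0"))
                .withMin(bytes)
                .withMax(bytes)
                .withNumNulls(0)
                .build();
    }
}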

Example 17 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.

the class TestTupleDomainParquetPredicate method stringColumnStats.

private static Statistics stringColumnStats(String minimum, String maximum) {
    Statistics.Builder builder = Statistics.getBuilderForReading(new PrimitiveType(OPTIONAL, BINARY, "testFile", UTF8));
    builder.withMin(minimum.getBytes()).withMax(maximum.getBytes()).withNumNulls(0);
    return builder.build();
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) BooleanStatistics(org.apache.parquet.column.statistics.BooleanStatistics) IntStatistics(org.apache.parquet.column.statistics.IntStatistics) FloatStatistics(org.apache.parquet.column.statistics.FloatStatistics) Statistics(org.apache.parquet.column.statistics.Statistics) DoubleStatistics(org.apache.parquet.column.statistics.DoubleStatistics) LongStatistics(org.apache.parquet.column.statistics.LongStatistics)
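A possible way to exercise this helper in a test (a sketch, not taken from TestTupleDomainParquetPredicate; it assumes the helper is visible to the test class): the builder produces a BinaryStatistics, so the generic min and max come back as Binary values.

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.io.api.Binary;
import org.junit.Test;

import static org.junit.Assert.assertEquals;

public class StringColumnStatsSketchTest {
    @Test
    public void decodesMinAndMaxBackToStrings() {
        // stringColumnStats(...) is the helper shown above.
        Statistics<?> stats = stringColumnStats("apple", "banana");
        assertEquals("apple", ((Binary) stats.genericGetMin()).toStringUsingUTF8());
        assertEquals("banana", ((Binary) stats.genericGetMax()).toStringUsingUTF8());
        assertEquals(0, stats.getNumNulls());
    }
}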

Example 18 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.

the class AggregatedParquetPageSource method getNumNulls.

private long getNumNulls(ParquetMetadata parquetMetadata, int columnIndex) {
    long numNulls = 0;
    for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) {
        Statistics statistics = blockMetaData.getColumns().get(columnIndex).getStatistics();
        if (!statistics.isNumNullsSet()) {
            throw new UnsupportedOperationException("Number of nulls not set for parquet file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
        }
        numNulls += statistics.getNumNulls();
    }
    completedBytes += INTEGER.getFixedSize();
    return numNulls;
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Statistics(org.apache.parquet.column.statistics.Statistics)
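getNumNulls expects a ParquetMetadata read from the file footer. A hedged sketch of producing one with plain parquet-hadoop and summing per-row-group null counts the same way (the class name, file path handling, and exception choice are illustrative, not Presto code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class NumNullsSketch {
    // Reads the footer of a Parquet file and sums the null count of one column
    // across all row groups, mirroring the loop in getNumNulls above.
    static long countNulls(String file, int columnIndex) throws Exception {
        try (ParquetFileReader reader = ParquetFileReader.open(
                HadoopInputFile.fromPath(new Path(file), new Configuration()))) {
            ParquetMetadata footer = reader.getFooter();
            long numNulls = 0;
            for (BlockMetaData block : footer.getBlocks()) {
                Statistics<?> statistics = block.getColumns().get(columnIndex).getStatistics();
                if (!statistics.isNumNullsSet()) {
                    throw new IllegalStateException("Null count missing for column " + columnIndex);
                }
                numNulls += statistics.getNumNulls();
            }
            return numNulls;
        }
    }
}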

Example 19 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.

the class AggregatedParquetPageSource method writeMinMax.

private void writeMinMax(ParquetMetadata parquetMetadata, int columnIndex, BlockBuilder blockBuilder, Type type, HiveType hiveType, boolean isMin) {
    org.apache.parquet.schema.Type parquetType = parquetMetadata.getFileMetaData().getSchema().getType(columnIndex);
    if (parquetType instanceof GroupType) {
        throw new IllegalArgumentException("Unsupported type : " + parquetType.toString());
    }
    Object value = null;
    for (BlockMetaData blockMetaData : parquetMetadata.getBlocks()) {
        Statistics statistics = blockMetaData.getColumns().get(columnIndex).getStatistics();
        if (!statistics.hasNonNullValue()) {
            throw new UnsupportedOperationException("No min/max found for parquet file. Set session property hive.pushdown_partial_aggregations_into_scan=false and execute query again");
        }
        if (isMin) {
            Object currentValue = statistics.genericGetMin();
            if (currentValue != null && (value == null || ((Comparable) currentValue).compareTo(value) < 0)) {
                value = currentValue;
            }
        } else {
            Object currentValue = statistics.genericGetMax();
            if (currentValue != null && (value == null || ((Comparable) currentValue).compareTo(value) > 0)) {
                value = currentValue;
            }
        }
    }
    if (type instanceof FixedWidthType) {
        completedBytes += ((FixedWidthType) type).getFixedSize();
    }
    if (value == null) {
        blockBuilder.appendNull();
        return;
    }
    PrimitiveType.PrimitiveTypeName parquetTypeName = parquetType.asPrimitiveType().getPrimitiveTypeName();
    switch(parquetTypeName) {
        case INT32:
            {
                blockBuilder.writeLong(Long.valueOf((Integer) value));
                break;
            }
        case INT64:
            {
                blockBuilder.writeLong((Long) value);
                break;
            }
        case INT96:
            {
                blockBuilder.writeLong(getTimestampMillis(((Binary) value).getBytes(), 0));
                break;
            }
        case FLOAT:
            {
                blockBuilder.writeLong(floatToRawIntBits((Float) value));
                break;
            }
        case DOUBLE:
            {
                type.writeDouble(blockBuilder, (Double) value);
                break;
            }
        case FIXED_LEN_BYTE_ARRAY:
            {
                byte[] valBytes = ((Binary) value).getBytes();
                DecimalType decimalType = (DecimalType) hiveType.getType(typeManager);
                if (decimalType.isShort()) {
                    blockBuilder.writeLong(getShortDecimalValue(valBytes));
                } else {
                    BigInteger bigIntValue = new BigInteger(valBytes);
                    type.writeSlice(blockBuilder, encodeUnscaledValue(bigIntValue));
                }
                break;
            }
        case BINARY:
            {
                Slice slice = Slices.wrappedBuffer(((Binary) value).getBytes());
                blockBuilder.writeBytes(slice, 0, slice.length()).closeEntry();
                completedBytes += slice.length();
                break;
            }
        case BOOLEAN:
        default:
            throw new IllegalArgumentException("Unexpected parquet type name: " + parquetTypeName);
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Statistics(org.apache.parquet.column.statistics.Statistics) GroupType(org.apache.parquet.schema.GroupType) Slice(io.airlift.slice.Slice) DecimalType(com.facebook.presto.common.type.DecimalType) BigInteger(java.math.BigInteger) PrimitiveType(org.apache.parquet.schema.PrimitiveType) Binary(org.apache.parquet.io.api.Binary) FixedWidthType(com.facebook.presto.common.type.FixedWidthType)
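The reusable part of writeMinMax is the fold over row-group statistics at the top; below is a standalone, hedged version of just that step (class and method names are illustrative, and unlike the Presto code it skips row groups without min/max instead of throwing):

import java.util.Optional;

import org.apache.parquet.column.statistics.Statistics;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

public class MinMaxFoldSketch {
    // Folds per-row-group statistics into a single file-level min or max,
    // mirroring the comparison loop in writeMinMax above.
    @SuppressWarnings({"rawtypes", "unchecked"})
    static Optional<Object> fileLevelExtreme(ParquetMetadata footer, int columnIndex, boolean isMin) {
        Object value = null;
        for (BlockMetaData block : footer.getBlocks()) {
            Statistics statistics = block.getColumns().get(columnIndex).getStatistics();
            if (!statistics.hasNonNullValue()) {
                continue; // simplification: the Presto code throws here
            }
            Object current = isMin ? statistics.genericGetMin() : statistics.genericGetMax();
            if (current == null) {
                continue;
            }
            if (value == null) {
                value = current;
            } else {
                int cmp = ((Comparable) current).compareTo(value);
                if (isMin ? cmp < 0 : cmp > 0) {
                    value = current;
                }
            }
        }
        return Optional.ofNullable(value);
    }
}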

Example 20 with Statistics

use of org.apache.parquet.column.statistics.Statistics in project presto by prestodb.

the class TupleDomainParquetPredicate method getDomain.

/**
 * Get a domain for the ranges defined by each pair of elements from {@code minimums} and {@code maximums}.
 * Both arrays must have the same length.
 */
private static Domain getDomain(ColumnDescriptor column, Type type, List<Object> minimums, List<Object> maximums, boolean hasNullValue) {
    checkArgument(minimums.size() == maximums.size(), "Expected minimums and maximums to have the same size");
    List<Range> ranges = new ArrayList<>();
    if (type.equals(BOOLEAN)) {
        boolean hasTrueValues = minimums.stream().anyMatch(value -> (boolean) value) || maximums.stream().anyMatch(value -> (boolean) value);
        boolean hasFalseValues = minimums.stream().anyMatch(value -> !(boolean) value) || maximums.stream().anyMatch(value -> !(boolean) value);
        if (hasTrueValues && hasFalseValues) {
            return Domain.all(type);
        }
        if (hasTrueValues) {
            return Domain.create(ValueSet.of(type, true), hasNullValue);
        }
        if (hasFalseValues) {
            return Domain.create(ValueSet.of(type, false), hasNullValue);
        }
        // All nulls case is handled earlier
        throw new VerifyException("Impossible boolean statistics");
    }
    if ((type.equals(BIGINT) || type.equals(TINYINT) || type.equals(SMALLINT) || type.equals(INTEGER))) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(REAL)) {
        for (int i = 0; i < minimums.size(); i++) {
            Float min = (Float) minimums.get(i);
            Float max = (Float) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, (long) floatToRawIntBits(min), true, (long) floatToRawIntBits(max), true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DOUBLE)) {
        for (int i = 0; i < minimums.size(); i++) {
            Double min = (Double) minimums.get(i);
            Double max = (Double) maximums.get(i);
            if (min.isNaN() || max.isNaN()) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (isVarcharType(type)) {
        for (int i = 0; i < minimums.size(); i++) {
            Slice min = Slices.wrappedBuffer(((Binary) minimums.get(i)).toByteBuffer());
            Slice max = Slices.wrappedBuffer(((Binary) maximums.get(i)).toByteBuffer());
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    if (type.equals(DATE)) {
        for (int i = 0; i < minimums.size(); i++) {
            long min = asLong(minimums.get(i));
            long max = asLong(maximums.get(i));
            if (isStatisticsOverflow(type, min, max)) {
                return Domain.create(ValueSet.all(type), hasNullValue);
            }
            ranges.add(Range.range(type, min, true, max, true));
        }
        checkArgument(!ranges.isEmpty(), "cannot use empty ranges");
        return Domain.create(ValueSet.ofRanges(ranges), hasNullValue);
    }
    return Domain.create(ValueSet.all(type), hasNullValue);
}
Also used : PrimitiveType(org.apache.parquet.schema.PrimitiveType) RichColumnDescriptor(com.facebook.presto.parquet.RichColumnDescriptor) PredicateUtils.isStatisticsOverflow(com.facebook.presto.parquet.predicate.PredicateUtils.isStatisticsOverflow) FilterPredicate(org.apache.parquet.filter2.predicate.FilterPredicate) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) FilterApi(org.apache.parquet.filter2.predicate.FilterApi) ByteBuffer(java.nio.ByteBuffer) ParquetCorruptionException(com.facebook.presto.parquet.ParquetCorruptionException) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Slices(io.airlift.slice.Slices) Map(java.util.Map) Varchars.isVarcharType(com.facebook.presto.common.type.Varchars.isVarcharType) UserDefinedPredicate(org.apache.parquet.filter2.predicate.UserDefinedPredicate) DOUBLE(com.facebook.presto.common.type.DoubleType.DOUBLE) BINARY(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY) ColumnIndexStore(org.apache.parquet.internal.filter2.columnindex.ColumnIndexStore) String.format(java.lang.String.format) Range(com.facebook.presto.common.predicate.Range) Binary(org.apache.parquet.io.api.Binary) Serializable(java.io.Serializable) INT64(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64) List(java.util.List) LITTLE_ENDIAN(java.nio.ByteOrder.LITTLE_ENDIAN) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) INTEGER(com.facebook.presto.common.type.IntegerType.INTEGER) Optional(java.util.Optional) DictionaryPage(com.facebook.presto.parquet.DictionaryPage) ColumnPath(org.apache.parquet.hadoop.metadata.ColumnPath) Slice(io.airlift.slice.Slice) TINYINT(com.facebook.presto.common.type.TinyintType.TINYINT) HashMap(java.util.HashMap) INT32(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32) Function(java.util.function.Function) DATE(com.facebook.presto.common.type.DateType.DATE) REAL(com.facebook.presto.common.type.RealType.REAL) ArrayList(java.util.ArrayList) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) ParquetDataSourceId(com.facebook.presto.parquet.ParquetDataSourceId) PrimitiveTypeName(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName) BOOLEAN(com.facebook.presto.common.type.BooleanType.BOOLEAN) Type(com.facebook.presto.common.type.Type) VerifyException(com.google.common.base.VerifyException) Statistics(org.apache.parquet.column.statistics.Statistics) BIGINT(com.facebook.presto.common.type.BigintType.BIGINT) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain) Dictionary(com.facebook.presto.parquet.dictionary.Dictionary) FLOAT(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.FLOAT) SMALLINT(com.facebook.presto.common.type.SmallintType.SMALLINT) VisibleForTesting(com.google.common.annotations.VisibleForTesting) ValueSet(com.facebook.presto.common.predicate.ValueSet)
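To make the BIGINT branch concrete, a hedged sketch (class name and values are made up) that builds the same kind of multi-range Domain by hand for a column with two row groups:

import java.util.ArrayList;
import java.util.List;

import com.facebook.presto.common.predicate.Domain;
import com.facebook.presto.common.predicate.Range;
import com.facebook.presto.common.predicate.ValueSet;

import static com.facebook.presto.common.type.BigintType.BIGINT;

public class BigintDomainSketch {
    // Two row groups with min/max [10, 20] and [100, 150] and no nulls:
    // the resulting domain is the union of the two closed ranges, which is
    // what the BIGINT loop in getDomain produces from per-row-group statistics.
    static Domain sampleDomain() {
        List<Range> ranges = new ArrayList<>();
        ranges.add(Range.range(BIGINT, 10L, true, 20L, true));
        ranges.add(Range.range(BIGINT, 100L, true, 150L, true));
        return Domain.create(ValueSet.ofRanges(ranges), /* hasNullValue */ false);
    }
}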

Aggregations

Statistics (org.apache.parquet.column.statistics.Statistics): 20
IntStatistics (org.apache.parquet.column.statistics.IntStatistics): 14
LongStatistics (org.apache.parquet.column.statistics.LongStatistics): 14
DoubleStatistics (org.apache.parquet.column.statistics.DoubleStatistics): 12
FloatStatistics (org.apache.parquet.column.statistics.FloatStatistics): 12
BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 11
PrimitiveType (org.apache.parquet.schema.PrimitiveType): 11
BooleanStatistics (org.apache.parquet.column.statistics.BooleanStatistics): 9
TypeProtos (org.apache.drill.common.types.TypeProtos): 6
HashMap (java.util.HashMap): 5
Stopwatch (com.google.common.base.Stopwatch): 4
SchemaPath (org.apache.drill.common.expression.SchemaPath): 4
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 4
Slice (io.airlift.slice.Slice): 2
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 2
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 2
Binary (org.apache.parquet.io.api.Binary): 2
Test (org.junit.Test): 2
Domain (com.facebook.presto.common.predicate.Domain): 1
Range (com.facebook.presto.common.predicate.Range): 1