Search in sources :

Example 16 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class TupleDomainOrcPredicate method getDomain.

@VisibleForTesting
public static Domain getDomain(Type type, long rowCount, ColumnStatistics columnStatistics) {
    if (rowCount == 0) {
        return Domain.none(type);
    }
    if (columnStatistics == null) {
        return Domain.all(type);
    }
    if (columnStatistics.hasNumberOfValues() && columnStatistics.getNumberOfValues() == 0) {
        return Domain.onlyNull(type);
    }
    boolean hasNullValue = columnStatistics.getNumberOfValues() != rowCount;
    if (type instanceof TimeType && columnStatistics.getIntegerStatistics() != null) {
        // This is the representation of TIME used by Iceberg
        return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics(), value -> ((long) value) * Timestamps.PICOSECONDS_PER_MICROSECOND);
    }
    if (type.getJavaType() == boolean.class && columnStatistics.getBooleanStatistics() != null) {
        BooleanStatistics booleanStatistics = columnStatistics.getBooleanStatistics();
        boolean hasTrueValues = (booleanStatistics.getTrueValueCount() != 0);
        boolean hasFalseValues = (columnStatistics.getNumberOfValues() != booleanStatistics.getTrueValueCount());
        if (hasTrueValues && hasFalseValues) {
            return Domain.all(BOOLEAN);
        }
        if (hasTrueValues) {
            return Domain.create(ValueSet.of(BOOLEAN, true), hasNullValue);
        }
        if (hasFalseValues) {
            return Domain.create(ValueSet.of(BOOLEAN, false), hasNullValue);
        }
    } else if (isShortDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> rescale(value, (DecimalType) type).unscaledValue().longValue());
    } else if (isLongDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDecimalStatistics(), value -> Int128.valueOf(rescale(value, (DecimalType) type).unscaledValue()));
    } else if (type instanceof CharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getStringStatistics(), value -> truncateToLengthAndTrimSpaces(value, type));
    } else if (type instanceof VarcharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getStringStatistics());
    } else if (type instanceof DateType && columnStatistics.getDateStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDateStatistics(), value -> (long) value);
    } else if ((type.equals(TIMESTAMP_MILLIS) || type.equals(TIMESTAMP_MICROS)) && columnStatistics.getTimestampStatistics() != null) {
        // upper bound of the domain we create must be adjusted accordingly, to includes the rounded timestamp.
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> min * MICROSECONDS_PER_MILLISECOND, max -> (max + 1) * MICROSECONDS_PER_MILLISECOND);
    } else if (type.equals(TIMESTAMP_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> new LongTimestamp(min * MICROSECONDS_PER_MILLISECOND, 0), max -> new LongTimestamp((max + 1) * MICROSECONDS_PER_MILLISECOND, 0));
    } else if (type.equals(TIMESTAMP_TZ_MILLIS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), value -> packDateTimeWithZone(value, UTC_KEY));
    } else if (type.equals(TIMESTAMP_TZ_MICROS) && (columnStatistics.getTimestampStatistics() != null)) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(min, 0, UTC_KEY), max -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(max, 999_000_000, UTC_KEY));
    } else if (type.equals(TIMESTAMP_TZ_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getTimestampStatistics(), min -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(min, 0, UTC_KEY), max -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(max, 999_999_000, UTC_KEY));
    } else if (type.getJavaType() == long.class && columnStatistics.getIntegerStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getIntegerStatistics());
    } else if (type.getJavaType() == double.class && columnStatistics.getDoubleStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics());
    } else if (REAL.equals(type) && columnStatistics.getDoubleStatistics() != null) {
        return createDomain(type, hasNullValue, columnStatistics.getDoubleStatistics(), value -> (long) floatToRawIntBits(value.floatValue()));
    }
    return Domain.create(ValueSet.all(type), hasNullValue);
}
Also used : MICROSECONDS_PER_MILLISECOND(io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND) DateType(io.trino.spi.type.DateType) TIMESTAMP_TZ_NANOS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_NANOS) LongTimestampWithTimeZone(io.trino.spi.type.LongTimestampWithTimeZone) Decimals.rescale(io.trino.spi.type.Decimals.rescale) RangeStatistics(io.trino.orc.metadata.statistics.RangeStatistics) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) Range(io.trino.spi.predicate.Range) UTC_KEY(io.trino.spi.type.TimeZoneKey.UTC_KEY) Domain(io.trino.spi.predicate.Domain) Collection(java.util.Collection) DateTimeEncoding.packDateTimeWithZone(io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone) ValueSet(io.trino.spi.predicate.ValueSet) TIMESTAMP_NANOS(io.trino.spi.type.TimestampType.TIMESTAMP_NANOS) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Optional(java.util.Optional) DateTimeEncoding.unpackMillisUtc(io.trino.spi.type.DateTimeEncoding.unpackMillisUtc) DecimalType(io.trino.spi.type.DecimalType) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) Timestamps(io.trino.spi.type.Timestamps) Slice(io.airlift.slice.Slice) TimeType(io.trino.spi.type.TimeType) Decimals.isLongDecimal(io.trino.spi.type.Decimals.isLongDecimal) TIMESTAMP_MILLIS(io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS) Type(io.trino.spi.type.Type) BOOLEAN(io.trino.spi.type.BooleanType.BOOLEAN) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) Function(java.util.function.Function) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) TIMESTAMP_TZ_MILLIS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS) ImmutableList(com.google.common.collect.ImmutableList) Chars.truncateToLengthAndTrimSpaces(io.trino.spi.type.Chars.truncateToLengthAndTrimSpaces) Objects.requireNonNull(java.util.Objects.requireNonNull) Math.floorDiv(java.lang.Math.floorDiv) Decimals.isShortDecimal(io.trino.spi.type.Decimals.isShortDecimal) Int128(io.trino.spi.type.Int128) TIMESTAMP_TZ_MICROS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS) LongTimestamp(io.trino.spi.type.LongTimestamp) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) TIMESTAMP_MICROS(io.trino.spi.type.TimestampType.TIMESTAMP_MICROS) VarbinaryType(io.trino.spi.type.VarbinaryType) CharType(io.trino.spi.type.CharType) BooleanStatistics(io.trino.orc.metadata.statistics.BooleanStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) OrcColumnId(io.trino.orc.metadata.OrcColumnId) LongTimestamp(io.trino.spi.type.LongTimestamp) VarcharType(io.trino.spi.type.VarcharType) BooleanStatistics(io.trino.orc.metadata.statistics.BooleanStatistics) DecimalType(io.trino.spi.type.DecimalType) CharType(io.trino.spi.type.CharType) DateType(io.trino.spi.type.DateType) TimeType(io.trino.spi.type.TimeType) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 17 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class IcebergOrcFileWriter method computeMetrics.

private static Metrics computeMetrics(MetricsConfig metricsConfig, Schema icebergSchema, ColumnMetadata<OrcType> orcColumns, long fileRowCount, Optional<ColumnMetadata<ColumnStatistics>> columnStatistics) {
    if (columnStatistics.isEmpty()) {
        return new Metrics(fileRowCount, null, null, null, null, null, null);
    }
    // Columns that are descendants of LIST or MAP types are excluded because:
    // 1. Their stats are not used by Apache Iceberg to filter out data files
    // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
    // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
    Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);
    ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
    ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();
    // OrcColumnId(0) is the root column that represents file-level schema
    for (int i = 1; i < orcColumns.size(); i++) {
        OrcColumnId orcColumnId = new OrcColumnId(i);
        if (excludedColumns.contains(orcColumnId)) {
            continue;
        }
        OrcType orcColumn = orcColumns.get(orcColumnId);
        ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
        int icebergId = getIcebergId(orcColumn);
        Types.NestedField icebergField = icebergSchema.findField(icebergId);
        MetricsModes.MetricsMode metricsMode = MetricsUtil.metricsMode(icebergSchema, metricsConfig, icebergId);
        if (metricsMode.equals(MetricsModes.None.get())) {
            continue;
        }
        verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
        valueCountsBuilder.put(icebergId, fileRowCount);
        if (orcColumnStats.hasNumberOfValues()) {
            nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
        }
        if (!metricsMode.equals(MetricsModes.Counts.get())) {
            toIcebergMinMax(orcColumnStats, icebergField.type(), metricsMode).ifPresent(minMax -> {
                lowerBoundsBuilder.put(icebergId, minMax.getMin());
                upperBoundsBuilder.put(icebergId, minMax.getMax());
            });
        }
    }
    Map<Integer, Long> valueCounts = valueCountsBuilder.buildOrThrow();
    Map<Integer, Long> nullCounts = nullCountsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.buildOrThrow();
    Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.buildOrThrow();
    return new Metrics(fileRowCount, // TODO: Add column size accounting to ORC column writers
    null, valueCounts.isEmpty() ? null : valueCounts, nullCounts.isEmpty() ? null : nullCounts, // TODO: Add nanValueCounts to ORC writer
    null, lowerBounds.isEmpty() ? null : lowerBounds, upperBounds.isEmpty() ? null : upperBounds);
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) OrcColumnId(io.trino.orc.metadata.OrcColumnId) Types(org.apache.iceberg.types.Types) ByteBuffer(java.nio.ByteBuffer) ImmutableMap(com.google.common.collect.ImmutableMap) MetricsModes(org.apache.iceberg.MetricsModes) Metrics(org.apache.iceberg.Metrics) OrcType(io.trino.orc.metadata.OrcType)

Example 18 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class DecimalColumnWriter method getIndexStreams.

@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<DecimalStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    List<LongStreamCheckpoint> scaleCheckpoints = scaleStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        DecimalStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        LongStreamCheckpoint scaleCheckpoint = scaleCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createDecimalColumnPositionList(compressed, dataCheckpoint, scaleCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(columnId, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) LongStreamCheckpoint(io.trino.orc.checkpoint.LongStreamCheckpoint) DecimalStreamCheckpoint(io.trino.orc.checkpoint.DecimalStreamCheckpoint) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) LongStreamCheckpoint(io.trino.orc.checkpoint.LongStreamCheckpoint) DecimalStreamCheckpoint(io.trino.orc.checkpoint.DecimalStreamCheckpoint) RowGroupIndex(io.trino.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) List(java.util.List) PresentOutputStream(io.trino.orc.stream.PresentOutputStream) DecimalOutputStream(io.trino.orc.stream.DecimalOutputStream) Stream(io.trino.orc.metadata.Stream) LongOutputStream(io.trino.orc.stream.LongOutputStream)

Example 19 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class DoubleColumnWriter method getIndexStreams.

@Override
public List<StreamDataOutput> getIndexStreams(CompressedMetadataWriter metadataWriter) throws IOException {
    checkState(closed);
    ImmutableList.Builder<RowGroupIndex> rowGroupIndexes = ImmutableList.builder();
    List<DoubleStreamCheckpoint> dataCheckpoints = dataStream.getCheckpoints();
    Optional<List<BooleanStreamCheckpoint>> presentCheckpoints = presentStream.getCheckpoints();
    for (int i = 0; i < rowGroupColumnStatistics.size(); i++) {
        int groupId = i;
        ColumnStatistics columnStatistics = rowGroupColumnStatistics.get(groupId);
        DoubleStreamCheckpoint dataCheckpoint = dataCheckpoints.get(groupId);
        Optional<BooleanStreamCheckpoint> presentCheckpoint = presentCheckpoints.map(checkpoints -> checkpoints.get(groupId));
        List<Integer> positions = createDoubleColumnPositionList(compressed, dataCheckpoint, presentCheckpoint);
        rowGroupIndexes.add(new RowGroupIndex(positions, columnStatistics));
    }
    Slice slice = metadataWriter.writeRowIndexes(rowGroupIndexes.build());
    Stream stream = new Stream(columnId, StreamKind.ROW_INDEX, slice.length(), false);
    return ImmutableList.of(new StreamDataOutput(slice, stream));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) BooleanStreamCheckpoint(io.trino.orc.checkpoint.BooleanStreamCheckpoint) DoubleStreamCheckpoint(io.trino.orc.checkpoint.DoubleStreamCheckpoint) RowGroupIndex(io.trino.orc.metadata.RowGroupIndex) Slice(io.airlift.slice.Slice) DoubleStreamCheckpoint(io.trino.orc.checkpoint.DoubleStreamCheckpoint) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) List(java.util.List) PresentOutputStream(io.trino.orc.stream.PresentOutputStream) DoubleOutputStream(io.trino.orc.stream.DoubleOutputStream) Stream(io.trino.orc.metadata.Stream)

Example 20 with ColumnStatistics

use of io.trino.orc.metadata.statistics.ColumnStatistics in project trino by trinodb.

the class DoubleColumnWriter method finishRowGroup.

@Override
public Map<OrcColumnId, ColumnStatistics> finishRowGroup() {
    checkState(!closed);
    ColumnStatistics statistics = statisticsBuilder.buildColumnStatistics();
    rowGroupColumnStatistics.add(statistics);
    statisticsBuilder = statisticsBuilderSupplier.get();
    return ImmutableMap.of(columnId, statistics);
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics)

Aggregations

ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics)45 Slice (io.airlift.slice.Slice)23 Stream (io.trino.orc.metadata.Stream)23 StreamDataOutput (io.trino.orc.stream.StreamDataOutput)20 ArrayList (java.util.ArrayList)20 ImmutableList (com.google.common.collect.ImmutableList)19 List (java.util.List)19 PresentOutputStream (io.trino.orc.stream.PresentOutputStream)17 RowGroupIndex (io.trino.orc.metadata.RowGroupIndex)15 BooleanStreamCheckpoint (io.trino.orc.checkpoint.BooleanStreamCheckpoint)12 OrcColumnId (io.trino.orc.metadata.OrcColumnId)12 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)9 LongOutputStream (io.trino.orc.stream.LongOutputStream)9 BloomFilter (io.trino.orc.metadata.statistics.BloomFilter)8 ImmutableMap (com.google.common.collect.ImmutableMap)7 LongStreamCheckpoint (io.trino.orc.checkpoint.LongStreamCheckpoint)7 ColumnEncoding (io.trino.orc.metadata.ColumnEncoding)5 ColumnMetadata (io.trino.orc.metadata.ColumnMetadata)5 StripeFooter (io.trino.orc.metadata.StripeFooter)5 LongOutputStream.createLengthOutputStream (io.trino.orc.stream.LongOutputStream.createLengthOutputStream)5