Search in sources :

Example 1 with BooleanStatistics

use of io.trino.orc.metadata.statistics.BooleanStatistics in project trino by trinodb.

the class TupleDomainOrcPredicate method getDomain.

/**
 * Builds the {@link Domain} of values a column may contain, based on its ORC column statistics.
 *
 * <p>Short-circuit cases, checked in order:
 * <ul>
 *   <li>{@code rowCount == 0}: {@code Domain.none(type)};</li>
 *   <li>{@code columnStatistics == null}: {@code Domain.all(type)};</li>
 *   <li>the statistics report a value count and it is zero: {@code Domain.onlyNull(type)}.</li>
 * </ul>
 * Otherwise the first type-specific statistics object that matches {@code type} and is present
 * is turned into a domain. When none matches, the result allows every value of {@code type}.
 *
 * @param type the type of the column
 * @param rowCount the number of rows covered by the statistics
 * @param columnStatistics the ORC statistics of the column, may be {@code null}
 * @return the domain derived from the statistics
 */
@VisibleForTesting
public static Domain getDomain(Type type, long rowCount, ColumnStatistics columnStatistics) {
    // No rows: nothing can match.
    if (rowCount == 0) {
        return Domain.none(type);
    }
    // No statistics: nothing can be ruled out.
    if (columnStatistics == null) {
        return Domain.all(type);
    }
    // A reported value count of zero means only nulls are present.
    if (columnStatistics.hasNumberOfValues() && columnStatistics.getNumberOfValues() == 0) {
        return Domain.onlyNull(type);
    }

    // Nulls are allowed whenever the value count differs from the row count.
    boolean nullAllowed = columnStatistics.getNumberOfValues() != rowCount;

    if (type instanceof TimeType && columnStatistics.getIntegerStatistics() != null) {
        // TIME as represented by Iceberg: integer statistics scaled by PICOSECONDS_PER_MICROSECOND
        return createDomain(type, nullAllowed, columnStatistics.getIntegerStatistics(), micros -> ((long) micros) * Timestamps.PICOSECONDS_PER_MICROSECOND);
    }

    if (type.getJavaType() == boolean.class && columnStatistics.getBooleanStatistics() != null) {
        BooleanStatistics boolStats = columnStatistics.getBooleanStatistics();
        boolean anyTrue = (boolStats.getTrueValueCount() != 0);
        boolean anyFalse = (columnStatistics.getNumberOfValues() != boolStats.getTrueValueCount());
        if (anyTrue && anyFalse) {
            return Domain.all(BOOLEAN);
        }
        if (anyTrue || anyFalse) {
            // Exactly one of the two flags is set here, so anyTrue is the single value present.
            return Domain.create(ValueSet.of(BOOLEAN, anyTrue), nullAllowed);
        }
        // Neither value was observed: use the generic fallback without probing other statistics.
        return Domain.create(ValueSet.all(type), nullAllowed);
    }

    if (isShortDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getDecimalStatistics(), decimal -> rescale(decimal, (DecimalType) type).unscaledValue().longValue());
    }
    if (isLongDecimal(type) && columnStatistics.getDecimalStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getDecimalStatistics(), decimal -> Int128.valueOf(rescale(decimal, (DecimalType) type).unscaledValue()));
    }

    if (type instanceof CharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getStringStatistics(), text -> truncateToLengthAndTrimSpaces(text, type));
    }
    if (type instanceof VarcharType && columnStatistics.getStringStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getStringStatistics());
    }

    if (type instanceof DateType && columnStatistics.getDateStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getDateStatistics(), days -> (long) days);
    }

    if ((type.equals(TIMESTAMP_MILLIS) || type.equals(TIMESTAMP_MICROS)) && columnStatistics.getTimestampStatistics() != null) {
        // The upper bound is bumped by one before scaling so that the domain includes the rounded timestamp.
        return createDomain(type, nullAllowed, columnStatistics.getTimestampStatistics(), lower -> lower * MICROSECONDS_PER_MILLISECOND, upper -> (upper + 1) * MICROSECONDS_PER_MILLISECOND);
    }
    if (type.equals(TIMESTAMP_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getTimestampStatistics(), lower -> new LongTimestamp(lower * MICROSECONDS_PER_MILLISECOND, 0), upper -> new LongTimestamp((upper + 1) * MICROSECONDS_PER_MILLISECOND, 0));
    }
    if (type.equals(TIMESTAMP_TZ_MILLIS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getTimestampStatistics(), millis -> packDateTimeWithZone(millis, UTC_KEY));
    }
    if (type.equals(TIMESTAMP_TZ_MICROS) && (columnStatistics.getTimestampStatistics() != null)) {
        return createDomain(type, nullAllowed, columnStatistics.getTimestampStatistics(), lower -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(lower, 0, UTC_KEY), upper -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(upper, 999_000_000, UTC_KEY));
    }
    if (type.equals(TIMESTAMP_TZ_NANOS) && columnStatistics.getTimestampStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getTimestampStatistics(), lower -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(lower, 0, UTC_KEY), upper -> LongTimestampWithTimeZone.fromEpochMillisAndFraction(upper, 999_999_000, UTC_KEY));
    }

    if (type.getJavaType() == long.class && columnStatistics.getIntegerStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getIntegerStatistics());
    }
    if (type.getJavaType() == double.class && columnStatistics.getDoubleStatistics() != null) {
        return createDomain(type, nullAllowed, columnStatistics.getDoubleStatistics());
    }
    if (REAL.equals(type) && columnStatistics.getDoubleStatistics() != null) {
        // REAL values are mapped to the raw int bits of the float, widened to long
        return createDomain(type, nullAllowed, columnStatistics.getDoubleStatistics(), real -> (long) floatToRawIntBits(real.floatValue()));
    }

    // No usable statistics for this type: every value is possible.
    return Domain.create(ValueSet.all(type), nullAllowed);
}
Also used : MICROSECONDS_PER_MILLISECOND(io.trino.spi.type.Timestamps.MICROSECONDS_PER_MILLISECOND) DateType(io.trino.spi.type.DateType) TIMESTAMP_TZ_NANOS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_NANOS) LongTimestampWithTimeZone(io.trino.spi.type.LongTimestampWithTimeZone) Decimals.rescale(io.trino.spi.type.Decimals.rescale) RangeStatistics(io.trino.orc.metadata.statistics.RangeStatistics) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) Range(io.trino.spi.predicate.Range) UTC_KEY(io.trino.spi.type.TimeZoneKey.UTC_KEY) Domain(io.trino.spi.predicate.Domain) Collection(java.util.Collection) DateTimeEncoding.packDateTimeWithZone(io.trino.spi.type.DateTimeEncoding.packDateTimeWithZone) ValueSet(io.trino.spi.predicate.ValueSet) TIMESTAMP_NANOS(io.trino.spi.type.TimestampType.TIMESTAMP_NANOS) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) Optional(java.util.Optional) DateTimeEncoding.unpackMillisUtc(io.trino.spi.type.DateTimeEncoding.unpackMillisUtc) DecimalType(io.trino.spi.type.DecimalType) ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) Timestamps(io.trino.spi.type.Timestamps) Slice(io.airlift.slice.Slice) TimeType(io.trino.spi.type.TimeType) Decimals.isLongDecimal(io.trino.spi.type.Decimals.isLongDecimal) TIMESTAMP_MILLIS(io.trino.spi.type.TimestampType.TIMESTAMP_MILLIS) Type(io.trino.spi.type.Type) BOOLEAN(io.trino.spi.type.BooleanType.BOOLEAN) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) Function(java.util.function.Function) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) Float.floatToRawIntBits(java.lang.Float.floatToRawIntBits) TIMESTAMP_TZ_MILLIS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MILLIS) ImmutableList(com.google.common.collect.ImmutableList) 
Chars.truncateToLengthAndTrimSpaces(io.trino.spi.type.Chars.truncateToLengthAndTrimSpaces) Objects.requireNonNull(java.util.Objects.requireNonNull) Math.floorDiv(java.lang.Math.floorDiv) Decimals.isShortDecimal(io.trino.spi.type.Decimals.isShortDecimal) Int128(io.trino.spi.type.Int128) TIMESTAMP_TZ_MICROS(io.trino.spi.type.TimestampWithTimeZoneType.TIMESTAMP_TZ_MICROS) LongTimestamp(io.trino.spi.type.LongTimestamp) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) TIMESTAMP_MICROS(io.trino.spi.type.TimestampType.TIMESTAMP_MICROS) VarbinaryType(io.trino.spi.type.VarbinaryType) CharType(io.trino.spi.type.CharType) BooleanStatistics(io.trino.orc.metadata.statistics.BooleanStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) OrcColumnId(io.trino.orc.metadata.OrcColumnId) LongTimestamp(io.trino.spi.type.LongTimestamp) VarcharType(io.trino.spi.type.VarcharType) BooleanStatistics(io.trino.orc.metadata.statistics.BooleanStatistics) DecimalType(io.trino.spi.type.DecimalType) CharType(io.trino.spi.type.CharType) DateType(io.trino.spi.type.DateType) TimeType(io.trino.spi.type.TimeType) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with BooleanStatistics

use of io.trino.orc.metadata.statistics.BooleanStatistics in project trino by trinodb.

the class IcebergOrcFileWriter method toIcebergMinMax.

/**
 * Converts the min/max information of ORC column statistics into an {@code IcebergMinMax}.
 *
 * <p>The statistics kinds are probed in a fixed order: boolean, integer, double, string, date,
 * decimal, timestamp. The first kind that is present decides the result. For every kind except
 * boolean, a missing min or max yields an empty result.
 *
 * @param orcColumnStats the ORC statistics of the column
 * @param icebergType the Iceberg type of the column, used to narrow or rescale the bounds
 * @param metricsModes the metrics mode handed to the created {@code IcebergMinMax}
 * @return the converted bounds, or empty when no usable min/max is available
 */
private static Optional<IcebergMinMax> toIcebergMinMax(ColumnStatistics orcColumnStats, org.apache.iceberg.types.Type icebergType, MetricsModes.MetricsMode metricsModes) {
    BooleanStatistics booleans = orcColumnStats.getBooleanStatistics();
    if (booleans != null) {
        boolean anyTrue = booleans.getTrueValueCount() != 0;
        boolean anyFalse = orcColumnStats.getNumberOfValues() != booleans.getTrueValueCount();
        // The lower bound is true only when no false value exists; the upper bound is true when any true value exists.
        boolean lowerBound = !anyFalse;
        boolean upperBound = anyTrue;
        return Optional.of(new IcebergMinMax(icebergType, lowerBound, upperBound, metricsModes));
    }

    IntegerStatistics integers = orcColumnStats.getIntegerStatistics();
    if (integers != null) {
        Object lower = integers.getMin();
        Object upper = integers.getMax();
        if (lower != null && upper != null) {
            if (icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.INTEGER) {
                // Iceberg INTEGER bounds are narrowed to int; toIntExact fails on overflow
                lower = toIntExact((Long) lower);
                upper = toIntExact((Long) upper);
            }
            return Optional.of(new IcebergMinMax(icebergType, lower, upper, metricsModes));
        }
        return Optional.empty();
    }

    DoubleStatistics doubles = orcColumnStats.getDoubleStatistics();
    if (doubles != null) {
        Object lower = doubles.getMin();
        Object upper = doubles.getMax();
        if (lower != null && upper != null) {
            if (icebergType.typeId() == org.apache.iceberg.types.Type.TypeID.FLOAT) {
                // Iceberg FLOAT bounds are narrowed to float
                lower = ((Double) lower).floatValue();
                upper = ((Double) upper).floatValue();
            }
            return Optional.of(new IcebergMinMax(icebergType, lower, upper, metricsModes));
        }
        return Optional.empty();
    }

    StringStatistics strings = orcColumnStats.getStringStatistics();
    if (strings != null) {
        Slice lower = strings.getMin();
        Slice upper = strings.getMax();
        if (lower != null && upper != null) {
            return Optional.of(new IcebergMinMax(icebergType, lower.toStringUtf8(), upper.toStringUtf8(), metricsModes));
        }
        return Optional.empty();
    }

    DateStatistics dates = orcColumnStats.getDateStatistics();
    if (dates != null) {
        Integer lower = dates.getMin();
        Integer upper = dates.getMax();
        if (lower != null && upper != null) {
            return Optional.of(new IcebergMinMax(icebergType, lower, upper, metricsModes));
        }
        return Optional.empty();
    }

    DecimalStatistics decimals = orcColumnStats.getDecimalStatistics();
    if (decimals != null) {
        BigDecimal lower = decimals.getMin();
        BigDecimal upper = decimals.getMax();
        if (lower != null && upper != null) {
            // Both bounds are brought to the scale declared by the Iceberg decimal type
            int scale = ((Types.DecimalType) icebergType).scale();
            BigDecimal scaledLower = lower.setScale(scale);
            BigDecimal scaledUpper = upper.setScale(scale);
            return Optional.of(new IcebergMinMax(icebergType, scaledLower, scaledUpper, metricsModes));
        }
        return Optional.empty();
    }

    TimestampStatistics timestamps = orcColumnStats.getTimestampStatistics();
    if (timestamps != null) {
        Long lower = timestamps.getMin();
        Long upper = timestamps.getMax();
        if (lower != null && upper != null) {
            long lowerMicros = lower * MICROSECONDS_PER_MILLISECOND;
            // 999 microseconds are appended to the upper bound because the Trino ORC writer truncates timestamps.
            long upperMicros = (upper * MICROSECONDS_PER_MILLISECOND) + (MICROSECONDS_PER_MILLISECOND - 1);
            return Optional.of(new IcebergMinMax(icebergType, lowerMicros, upperMicros, metricsModes));
        }
        return Optional.empty();
    }

    // None of the supported statistics kinds is present.
    return Optional.empty();
}
Also used : DateStatistics(io.trino.orc.metadata.statistics.DateStatistics) TimestampStatistics(io.trino.orc.metadata.statistics.TimestampStatistics) BigDecimal(java.math.BigDecimal) StringStatistics(io.trino.orc.metadata.statistics.StringStatistics) DecimalStatistics(io.trino.orc.metadata.statistics.DecimalStatistics) DoubleStatistics(io.trino.orc.metadata.statistics.DoubleStatistics) Slice(io.airlift.slice.Slice) BooleanStatistics(io.trino.orc.metadata.statistics.BooleanStatistics) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics)

Aggregations

Slice (io.airlift.slice.Slice)2 BooleanStatistics (io.trino.orc.metadata.statistics.BooleanStatistics)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 MoreObjects.toStringHelper (com.google.common.base.MoreObjects.toStringHelper)1 ImmutableList (com.google.common.collect.ImmutableList)1 ColumnMetadata (io.trino.orc.metadata.ColumnMetadata)1 OrcColumnId (io.trino.orc.metadata.OrcColumnId)1 BloomFilter (io.trino.orc.metadata.statistics.BloomFilter)1 ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics)1 DateStatistics (io.trino.orc.metadata.statistics.DateStatistics)1 DecimalStatistics (io.trino.orc.metadata.statistics.DecimalStatistics)1 DoubleStatistics (io.trino.orc.metadata.statistics.DoubleStatistics)1 IntegerStatistics (io.trino.orc.metadata.statistics.IntegerStatistics)1 RangeStatistics (io.trino.orc.metadata.statistics.RangeStatistics)1 StringStatistics (io.trino.orc.metadata.statistics.StringStatistics)1 TimestampStatistics (io.trino.orc.metadata.statistics.TimestampStatistics)1 Domain (io.trino.spi.predicate.Domain)1 Range (io.trino.spi.predicate.Range)1 ValueSet (io.trino.spi.predicate.ValueSet)1 BIGINT (io.trino.spi.type.BigintType.BIGINT)1