Search in sources :

Example 1 with Estimate

use of io.trino.spi.statistics.Estimate in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculateDataSize.

@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream().filter(statistics -> {
        if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
            return false;
        }
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        if (columnStatistics == null) {
            return false;
        }
        return columnStatistics.getTotalSizeInBytes().isPresent();
    }).collect(toImmutableList());
    if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
        return Estimate.unknown();
    }
    long knownRowCount = 0;
    long knownDataSize = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
        knownRowCount += rowCount;
        knownDataSize += dataSize;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (knownRowCount == 0) {
        return Estimate.unknown();
    }
    double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
    return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with Estimate

use of io.trino.spi.statistics.Estimate in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculatePartitionsRowCount.

@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount) {
    long[] rowCounts = statistics.stream().map(PartitionStatistics::getBasicStatistics).map(HiveBasicStatistics::getRowCount).filter(OptionalLong::isPresent).mapToLong(OptionalLong::getAsLong).peek(count -> verify(count >= 0, "count must be greater than or equal to zero")).toArray();
    int sampleSize = statistics.size();
    // Sample contains all the queried partitions, estimate avg normally
    if (rowCounts.length <= 2 || queriedPartitionsCount == sampleSize) {
        OptionalDouble averageRowsPerPartitionOptional = Arrays.stream(rowCounts).average();
        if (averageRowsPerPartitionOptional.isEmpty()) {
            return Optional.empty();
        }
        double averageRowsPerPartition = averageRowsPerPartitionOptional.getAsDouble();
        return Optional.of(new PartitionsRowCount(averageRowsPerPartition, averageRowsPerPartition * queriedPartitionsCount));
    }
    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Excluding the min and max rowCount values from averageRowsPerPartition calculation helps to reduce the
    // possibility of errors in the extrapolated rowCount due to a couple of outliers.
    int minIndex = 0;
    int maxIndex = 0;
    long rowCountSum = rowCounts[0];
    for (int index = 1; index < rowCounts.length; index++) {
        if (rowCounts[index] < rowCounts[minIndex]) {
            minIndex = index;
        } else if (rowCounts[index] > rowCounts[maxIndex]) {
            maxIndex = index;
        }
        rowCountSum += rowCounts[index];
    }
    double averageWithoutOutliers = ((double) (rowCountSum - rowCounts[minIndex] - rowCounts[maxIndex])) / (rowCounts.length - 2);
    double rowCount = (averageWithoutOutliers * (queriedPartitionsCount - 2)) + rowCounts[minIndex] + rowCounts[maxIndex];
    return Optional.of(new PartitionsRowCount(averageWithoutOutliers, rowCount));
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) OptionalLong(java.util.OptionalLong) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) OptionalDouble(java.util.OptionalDouble) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with Estimate

use of io.trino.spi.statistics.Estimate in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculateNullsFraction.

@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
    List<PartitionStatistics> statisticsWithKnownRowCountAndNullsCount = partitionStatistics.stream().filter(statistics -> {
        if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
            return false;
        }
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        if (columnStatistics == null) {
            return false;
        }
        return columnStatistics.getNullsCount().isPresent();
    }).collect(toImmutableList());
    if (statisticsWithKnownRowCountAndNullsCount.isEmpty()) {
        return Estimate.unknown();
    }
    long totalNullsCount = 0;
    long totalRowCount = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndNullsCount) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long nullsCount = columnStatistics.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present"));
        verify(nullsCount >= 0, "nullsCount must be greater than or equal to zero");
        verify(nullsCount <= rowCount, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nullsCount, rowCount);
        totalNullsCount += nullsCount;
        totalRowCount += rowCount;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    verify(totalNullsCount <= totalRowCount, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", totalNullsCount, totalRowCount);
    return Estimate.of(((double) totalNullsCount) / totalRowCount);
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

VisibleForTesting (com.google.common.annotations.VisibleForTesting)3 MoreObjects.toStringHelper (com.google.common.base.MoreObjects.toStringHelper)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)3 Preconditions.checkState (com.google.common.base.Preconditions.checkState)3 Verify.verify (com.google.common.base.Verify.verify)3 Verify.verifyNotNull (com.google.common.base.Verify.verifyNotNull)3 VerifyException (com.google.common.base.VerifyException)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 ImmutableMap (com.google.common.collect.ImmutableMap)3 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)3 Maps.immutableEntry (com.google.common.collect.Maps.immutableEntry)3 HashFunction (com.google.common.hash.HashFunction)3 Hashing.murmur3_128 (com.google.common.hash.Hashing.murmur3_128)3 Ints (com.google.common.primitives.Ints)3 Shorts (com.google.common.primitives.Shorts)3 SignedBytes (com.google.common.primitives.SignedBytes)3 Logger (io.airlift.log.Logger)3 Slice (io.airlift.slice.Slice)3 HiveBasicStatistics (io.trino.plugin.hive.HiveBasicStatistics)3 HiveColumnHandle (io.trino.plugin.hive.HiveColumnHandle)3