Search in sources :

Example 1 with Estimate

use of io.prestosql.spi.statistics.Estimate in project hetu-core by openlookeng.

In class ColumnStatisticsData, method toColumnStatistics:

/**
 * Converts the statistics held by this object into an SPI {@link ColumnStatistics}.
 *
 * <p>The result carries the nulls fraction ({@code nullsCount / rowCount}), the distinct
 * values count, the data size when one is present (unknown otherwise), and a value range
 * only when both {@code min} and {@code max} are present.
 *
 * @param rowCount number of rows used as the denominator of the nulls fraction
 * @return the assembled column statistics
 */
public ColumnStatistics toColumnStatistics(long rowCount) {
    ColumnStatistics.Builder builder = ColumnStatistics.builder();
    // nullsCount / rowCount is a fraction of rows, so it belongs in the nulls fraction,
    // not in the data size (which is set separately below).
    // NOTE(review): a rowCount of 0 makes this ratio NaN or infinite — confirm callers never pass 0.
    builder.setNullsFraction(Estimate.of((double) nullsCount / (double) rowCount));
    builder.setDistinctValuesCount(Estimate.of(distinctValuesCount));
    builder.setDataSize(dataSize.map(Estimate::of).orElse(Estimate.unknown()));
    // A range is only meaningful when both bounds are known.
    if (min.isPresent() && max.isPresent()) {
        builder.setRange(new DoubleRange((double) min.get(), (double) max.get()));
    }
    return builder.build();
}
Also used : ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Estimate(io.prestosql.spi.statistics.Estimate)

Example 2 with Estimate

use of io.prestosql.spi.statistics.Estimate in project hetu-core by openlookeng.

In class MetastoreHiveStatisticsProvider, method calculateDataSize:

/**
 * Estimates the data size of {@code column} for {@code totalRowCount} rows by scaling the
 * average per-row size observed in the partitions that report usable statistics.
 *
 * <p>Only partitions that have a row count and a total size in bytes for the column are
 * considered. Returns {@link Estimate#unknown()} when no partition qualifies or when the
 * qualifying partitions contain zero rows in total, and {@link Estimate#zero()} when
 * {@code totalRowCount} is zero.
 *
 * @param column name of the column whose data size is estimated
 * @param partitionStatistics statistics of the partitions to sample from
 * @param totalRowCount row count the average per-row size is multiplied by
 * @return the estimated data size
 * @throws VerifyException if a qualifying partition reports a negative row count or data size
 */
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    // Keep only partitions that expose both a row count and a byte size for the column.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> partition.getBasicStatistics().getRowCount().isPresent())
            .filter(partition -> {
                HiveColumnStatistics stats = partition.getColumnStatistics().get(column);
                return stats != null && stats.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long rowsSeen = 0;
    long bytesSeen = 0;
    for (PartitionStatistics partition : usablePartitions) {
        long partitionRows = partition.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics stats = partition.getColumnStatistics().get(column);
        verify(stats != null, "columnStatistics is null");

        long partitionBytes = stats.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(partitionBytes >= 0, "dataSize must be greater than or equal to zero");

        rowsSeen = rowsSeen + partitionRows;
        bytesSeen = bytesSeen + partitionBytes;
    }

    // Nothing to scale to: the estimate is exactly zero.
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    // No sampled rows means no per-row average can be derived.
    if (rowsSeen == 0) {
        return Estimate.unknown();
    }

    double bytesPerRow = ((double) bytesSeen) / rowsSeen;
    return Estimate.of(bytesPerRow * totalRowCount);
}
Also used : HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Varchars.isVarcharType(io.prestosql.spi.type.Varchars.isVarcharType) Collections.unmodifiableList(java.util.Collections.unmodifiableList) DecimalType(io.prestosql.spi.type.DecimalType) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) NullableValue(io.prestosql.spi.predicate.NullableValue) BigDecimal(java.math.BigDecimal) HiveSessionProperties.isStatisticsEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Chars.isCharType(io.prestosql.spi.type.Chars.isCharType) Double.parseDouble(java.lang.Double.parseDouble) HiveErrorCode(io.prestosql.plugin.hive.HiveErrorCode) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.getPartitionStatisticsSampleSize(io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.isLongDecimal(io.prestosql.spi.type.Decimals.isLongDecimal) TINYINT(io.prestosql.spi.type.TinyintType.TINYINT) DecimalStatistics(io.prestosql.plugin.hive.metastore.DecimalStatistics) String.format(java.lang.String.format) HivePartition(io.prestosql.plugin.hive.HivePartition) Objects(java.util.Objects) Decimals.isShortDecimal(io.prestosql.spi.type.Decimals.isShortDecimal) List(java.util.List) DoubleStatistics(io.prestosql.plugin.hive.metastore.DoubleStatistics) Table(io.prestosql.plugin.hive.metastore.Table) LocalDate(java.time.LocalDate) 
Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) Decimals(io.prestosql.spi.type.Decimals) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) SchemaTableName(io.prestosql.spi.connector.SchemaTableName) UNPARTITIONED_ID(io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DOUBLE(io.prestosql.spi.type.DoubleType.DOUBLE) Double.isFinite(java.lang.Double.isFinite) DateStatistics(io.prestosql.plugin.hive.metastore.DateStatistics) DATE(io.prestosql.spi.type.DateType.DATE) REAL(io.prestosql.spi.type.RealType.REAL) SemiTransactionalHiveMetastore(io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore) ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) VerifyException(com.google.common.base.VerifyException) HiveIdentity(io.prestosql.plugin.hive.authentication.HiveIdentity) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) SignedBytes(com.google.common.primitives.SignedBytes) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) Estimate(io.prestosql.spi.statistics.Estimate) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) SMALLINT(io.prestosql.spi.type.SmallintType.SMALLINT) Double.isNaN(java.lang.Double.isNaN) IntegerStatistics(io.prestosql.plugin.hive.metastore.IntegerStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
Comparator(java.util.Comparator) HiveSessionProperties.isIgnoreCorruptedStatistics(io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 3 with Estimate

use of io.prestosql.spi.statistics.Estimate in project hetu-core by openlookeng.

In class MetastoreHiveStatisticsProvider, method calculateNullsFraction:

/**
 * Computes the fraction of null values of {@code column} over the partitions that report
 * usable statistics.
 *
 * <p>Only partitions that have a row count and a nulls count for the column are considered.
 * Returns {@link Estimate#unknown()} when no partition qualifies and {@link Estimate#zero()}
 * when the qualifying partitions contain zero rows in total.
 *
 * @param column name of the column whose nulls fraction is computed
 * @param partitionStatistics statistics of the partitions to aggregate
 * @return the ratio of summed nulls counts to summed row counts
 * @throws VerifyException if a qualifying partition reports a negative row count or nulls
 *         count, or a nulls count larger than its row count
 */
@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
    // Keep only partitions that expose both a row count and a nulls count for the column.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> partition.getBasicStatistics().getRowCount().isPresent())
            .filter(partition -> {
                HiveColumnStatistics stats = partition.getColumnStatistics().get(column);
                return stats != null && stats.getNullsCount().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long nullsSeen = 0;
    long rowsSeen = 0;
    for (PartitionStatistics partition : usablePartitions) {
        long partitionRows = partition.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics stats = partition.getColumnStatistics().get(column);
        verify(stats != null, "columnStatistics is null");

        long partitionNulls = stats.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present"));
        verify(partitionNulls >= 0, "nullsCount must be greater than or equal to zero");
        verify(partitionNulls <= partitionRows, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", partitionNulls, partitionRows);

        nullsSeen = nullsSeen + partitionNulls;
        rowsSeen = rowsSeen + partitionRows;
    }

    // With no rows at all there are no nulls either.
    if (rowsSeen == 0) {
        return Estimate.zero();
    }

    verify(nullsSeen <= rowsSeen, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", nullsSeen, rowsSeen);
    return Estimate.of(((double) nullsSeen) / rowsSeen);
}
Also used : HiveBasicStatistics(io.prestosql.plugin.hive.HiveBasicStatistics) TableStatistics(io.prestosql.spi.statistics.TableStatistics) Varchars.isVarcharType(io.prestosql.spi.type.Varchars.isVarcharType) Collections.unmodifiableList(java.util.Collections.unmodifiableList) DecimalType(io.prestosql.spi.type.DecimalType) HiveColumnHandle(io.prestosql.plugin.hive.HiveColumnHandle) NullableValue(io.prestosql.spi.predicate.NullableValue) BigDecimal(java.math.BigDecimal) HiveSessionProperties.isStatisticsEnabled(io.prestosql.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ConnectorSession(io.prestosql.spi.connector.ConnectorSession) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) Type(io.prestosql.spi.type.Type) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Chars.isCharType(io.prestosql.spi.type.Chars.isCharType) Double.parseDouble(java.lang.Double.parseDouble) HiveErrorCode(io.prestosql.plugin.hive.HiveErrorCode) PrestoException(io.prestosql.spi.PrestoException) HiveSessionProperties.getPartitionStatisticsSampleSize(io.prestosql.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) Decimals.isLongDecimal(io.prestosql.spi.type.Decimals.isLongDecimal) TINYINT(io.prestosql.spi.type.TinyintType.TINYINT) DecimalStatistics(io.prestosql.plugin.hive.metastore.DecimalStatistics) String.format(java.lang.String.format) HivePartition(io.prestosql.plugin.hive.HivePartition) Objects(java.util.Objects) Decimals.isShortDecimal(io.prestosql.spi.type.Decimals.isShortDecimal) List(java.util.List) DoubleStatistics(io.prestosql.plugin.hive.metastore.DoubleStatistics) Table(io.prestosql.plugin.hive.metastore.Table) LocalDate(java.time.LocalDate) 
Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) Decimals(io.prestosql.spi.type.Decimals) INTEGER(io.prestosql.spi.type.IntegerType.INTEGER) DoubleRange(io.prestosql.spi.statistics.DoubleRange) Float.intBitsToFloat(java.lang.Float.intBitsToFloat) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) SchemaTableName(io.prestosql.spi.connector.SchemaTableName) UNPARTITIONED_ID(io.prestosql.plugin.hive.HivePartition.UNPARTITIONED_ID) Verify.verify(com.google.common.base.Verify.verify) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DOUBLE(io.prestosql.spi.type.DoubleType.DOUBLE) Double.isFinite(java.lang.Double.isFinite) DateStatistics(io.prestosql.plugin.hive.metastore.DateStatistics) DATE(io.prestosql.spi.type.DateType.DATE) REAL(io.prestosql.spi.type.RealType.REAL) SemiTransactionalHiveMetastore(io.prestosql.plugin.hive.metastore.SemiTransactionalHiveMetastore) ColumnStatistics(io.prestosql.spi.statistics.ColumnStatistics) VerifyException(com.google.common.base.VerifyException) HiveIdentity(io.prestosql.plugin.hive.authentication.HiveIdentity) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) SignedBytes(com.google.common.primitives.SignedBytes) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) Estimate(io.prestosql.spi.statistics.Estimate) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) SMALLINT(io.prestosql.spi.type.SmallintType.SMALLINT) Double.isNaN(java.lang.Double.isNaN) IntegerStatistics(io.prestosql.plugin.hive.metastore.IntegerStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
Comparator(java.util.Comparator) HiveSessionProperties.isIgnoreCorruptedStatistics(io.prestosql.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.prestosql.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.prestosql.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

ColumnStatistics (io.prestosql.spi.statistics.ColumnStatistics)3 DoubleRange (io.prestosql.spi.statistics.DoubleRange)3 Estimate (io.prestosql.spi.statistics.Estimate)3 VisibleForTesting (com.google.common.annotations.VisibleForTesting)2 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)2 Verify.verify (com.google.common.base.Verify.verify)2 VerifyException (com.google.common.base.VerifyException)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)2 Maps.immutableEntry (com.google.common.collect.Maps.immutableEntry)2 HashFunction (com.google.common.hash.HashFunction)2 Hashing.murmur3_128 (com.google.common.hash.Hashing.murmur3_128)2 Ints (com.google.common.primitives.Ints)2 Shorts (com.google.common.primitives.Shorts)2 SignedBytes (com.google.common.primitives.SignedBytes)2 Logger (io.airlift.log.Logger)2 Slice (io.airlift.slice.Slice)2 HiveBasicStatistics (io.prestosql.plugin.hive.HiveBasicStatistics)2 HiveColumnHandle (io.prestosql.plugin.hive.HiveColumnHandle)2