Search in sources :

Example 1 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculateDataSize.

@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream().filter(statistics -> {
        if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
            return false;
        }
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        if (columnStatistics == null) {
            return false;
        }
        return columnStatistics.getTotalSizeInBytes().isPresent();
    }).collect(toImmutableList());
    if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
        return Estimate.unknown();
    }
    long knownRowCount = 0;
    long knownDataSize = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
        knownRowCount += rowCount;
        knownDataSize += dataSize;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (knownRowCount == 0) {
        return Estimate.unknown();
    }
    double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
    return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method createZeroStatistics.

private TableStatistics createZeroStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes) {
    TableStatistics.Builder result = TableStatistics.builder();
    result.setRowCount(Estimate.of(0));
    columns.forEach((columnName, columnHandle) -> {
        Type columnType = columnTypes.get(columnName);
        verifyNotNull(columnType, "columnType is missing for column: %s", columnName);
        ColumnStatistics.Builder columnStatistics = ColumnStatistics.builder();
        columnStatistics.setNullsFraction(Estimate.of(0));
        columnStatistics.setDistinctValuesCount(Estimate.of(0));
        if (hasDataSize(columnType)) {
            columnStatistics.setDataSize(Estimate.of(0));
        }
        result.setColumnStatistics(columnHandle, columnStatistics.build());
    });
    return result.build();
}
Also used : HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) DecimalType(io.trino.spi.type.DecimalType) Type(io.trino.spi.type.Type) VarcharType(io.trino.spi.type.VarcharType) CharType(io.trino.spi.type.CharType) TableStatistics(io.trino.spi.statistics.TableStatistics)

Example 3 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method testStatisticsNaNWithMultipleFiles.

@Test
public void testStatisticsNaNWithMultipleFiles() {
    // Stats with NaN values cannot be used. This transaction combines a file with NaN min/max values with one with 0.0 min/max values
    DeltaLakeTableHandle tableHandle = registerTable("nan_multi_file");
    TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    ColumnStatistics columnStatistics = stats.getColumnStatistics().get(COLUMN_HANDLE);
    assertEquals(columnStatistics.getRange(), Optional.empty());
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Test(org.testng.annotations.Test)

Example 4 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method testStatisticsZeroAndNegativeInfinity.

@Test
public void testStatisticsZeroAndNegativeInfinity() {
    DeltaLakeTableHandle tableHandle = registerTable("zero_negative_infinity");
    TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    ColumnStatistics columnStatistics = stats.getColumnStatistics().get(COLUMN_HANDLE);
    assertEquals(columnStatistics.getRange().get().getMin(), NEGATIVE_INFINITY);
    assertEquals(columnStatistics.getRange().get().getMax(), 0.0);
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Test(org.testng.annotations.Test)

Example 5 with ColumnStatistics

use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.

the class TestDeltaLakeMetastoreStatistics method testStatisticsZeroAndNaN.

@Test
public void testStatisticsZeroAndNaN() {
    // Stats with NaN values cannot be used
    DeltaLakeTableHandle tableHandle = registerTable("zero_nan");
    TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
    ColumnStatistics columnStatistics = stats.getColumnStatistics().get(COLUMN_HANDLE);
    assertEquals(columnStatistics.getRange().get().getMin(), 0.0);
    assertEquals(columnStatistics.getRange().get().getMax(), POSITIVE_INFINITY);
}
Also used : ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Test(org.testng.annotations.Test)

Aggregations

ColumnStatistics (io.trino.spi.statistics.ColumnStatistics)24 TableStatistics (io.trino.spi.statistics.TableStatistics)23 Test (org.testng.annotations.Test)15 DeltaLakeTableHandle (io.trino.plugin.deltalake.DeltaLakeTableHandle)14 ColumnHandle (io.trino.spi.connector.ColumnHandle)10 Type (io.trino.spi.type.Type)6 ImmutableMap (com.google.common.collect.ImmutableMap)5 HiveColumnStatistics (io.trino.plugin.hive.metastore.HiveColumnStatistics)5 CharType (io.trino.spi.type.CharType)5 VarcharType (io.trino.spi.type.VarcharType)5 Map (java.util.Map)5 SchemaTableName (io.trino.spi.connector.SchemaTableName)4 DecimalType (io.trino.spi.type.DecimalType)4 Objects.requireNonNull (java.util.Objects.requireNonNull)4 MoreObjects.toStringHelper (com.google.common.base.MoreObjects.toStringHelper)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)3 Preconditions.checkState (com.google.common.base.Preconditions.checkState)3 Verify.verify (com.google.common.base.Verify.verify)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)3