Search in sources :

Example 6 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculatePartitionsRowCount.

@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount) {
    long[] rowCounts = statistics.stream().map(PartitionStatistics::getBasicStatistics).map(HiveBasicStatistics::getRowCount).filter(OptionalLong::isPresent).mapToLong(OptionalLong::getAsLong).peek(count -> verify(count >= 0, "count must be greater than or equal to zero")).toArray();
    int sampleSize = statistics.size();
    // Sample contains all the queried partitions, estimate avg normally
    if (rowCounts.length <= 2 || queriedPartitionsCount == sampleSize) {
        OptionalDouble averageRowsPerPartitionOptional = Arrays.stream(rowCounts).average();
        if (averageRowsPerPartitionOptional.isEmpty()) {
            return Optional.empty();
        }
        double averageRowsPerPartition = averageRowsPerPartitionOptional.getAsDouble();
        return Optional.of(new PartitionsRowCount(averageRowsPerPartition, averageRowsPerPartition * queriedPartitionsCount));
    }
    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Excluding the min and max rowCount values from averageRowsPerPartition calculation helps to reduce the
    // possibility of errors in the extrapolated rowCount due to a couple of outliers.
    int minIndex = 0;
    int maxIndex = 0;
    long rowCountSum = rowCounts[0];
    for (int index = 1; index < rowCounts.length; index++) {
        if (rowCounts[index] < rowCounts[minIndex]) {
            minIndex = index;
        } else if (rowCounts[index] > rowCounts[maxIndex]) {
            maxIndex = index;
        }
        rowCountSum += rowCounts[index];
    }
    double averageWithoutOutliers = ((double) (rowCountSum - rowCounts[minIndex] - rowCounts[maxIndex])) / (rowCounts.length - 2);
    double rowCount = (averageWithoutOutliers * (queriedPartitionsCount - 2)) + rowCounts[minIndex] + rowCounts[maxIndex];
    return Optional.of(new PartitionsRowCount(averageWithoutOutliers, rowCount));
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) OptionalLong(java.util.OptionalLong) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) OptionalDouble(java.util.OptionalDouble) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 7 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class TestHiveGlueMetastore method testInvalidColumnStatisticsMetadata.

@Test
public void testInvalidColumnStatisticsMetadata() throws Exception {
    SchemaTableName tableName = temporaryTable("test_statistics_invalid_column_metadata");
    try {
        List<ColumnMetadata> columns = List.of(new ColumnMetadata("column1", BIGINT));
        Map<String, HiveColumnStatistics> columnStatistics = Map.of("column1", INTEGER_COLUMN_STATISTICS);
        PartitionStatistics partitionStatistics = PartitionStatistics.builder().setBasicStatistics(HIVE_BASIC_STATISTICS).setColumnStatistics(columnStatistics).build();
        doCreateEmptyTable(tableName, ORC, columns);
        // set table statistics for column1
        metastore.updateTableStatistics(tableName.getSchemaName(), tableName.getTableName(), NO_ACID_TRANSACTION, actualStatistics -> {
            assertThat(actualStatistics).isEqualTo(EMPTY_TABLE_STATISTICS);
            return partitionStatistics;
        });
        Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName()).get();
        TableInput tableInput = GlueInputConverter.convertTable(table);
        tableInput.setParameters(ImmutableMap.<String, String>builder().putAll(tableInput.getParameters()).put("column_stats_bad_data", "bad data").buildOrThrow());
        getGlueClient().updateTable(new UpdateTableRequest().withDatabaseName(tableName.getSchemaName()).withTableInput(tableInput));
        assertThat(metastore.getTableStatistics(tableName.getSchemaName(), tableName.getTableName())).isEqualTo(partitionStatistics);
    } finally {
        dropTable(tableName);
    }
}
Also used : TableInput(com.amazonaws.services.glue.model.TableInput) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) Table(io.trino.plugin.hive.metastore.Table) HiveUtil.isIcebergTable(io.trino.plugin.hive.util.HiveUtil.isIcebergTable) HiveUtil.isDeltaLakeTable(io.trino.plugin.hive.util.HiveUtil.isDeltaLakeTable) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) UpdateTableRequest(com.amazonaws.services.glue.model.UpdateTableRequest) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) SchemaTableName(io.trino.spi.connector.SchemaTableName) Test(org.testng.annotations.Test)

Example 8 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class TestHiveGlueMetastore method testStatisticsLargeNumberOfColumns.

@Test
public void testStatisticsLargeNumberOfColumns() throws Exception {
    SchemaTableName tableName = temporaryTable("test_statistics_large_number_of_columns");
    try {
        ImmutableList.Builder<ColumnMetadata> columns = ImmutableList.builder();
        ImmutableMap.Builder<String, HiveColumnStatistics> columnStatistics = ImmutableMap.builder();
        for (int i = 1; i < 1500; ++i) {
            String columnName = "t_bigint " + i + "_" + String.join("", Collections.nCopies(240, "x"));
            columns.add(new ColumnMetadata(columnName, BIGINT));
            columnStatistics.put(columnName, createIntegerColumnStatistics(OptionalLong.of(-1000 - i), OptionalLong.of(1000 + i), OptionalLong.of(i), OptionalLong.of(2 * i)));
        }
        PartitionStatistics partitionStatistics = PartitionStatistics.builder().setBasicStatistics(HIVE_BASIC_STATISTICS).setColumnStatistics(columnStatistics.buildOrThrow()).build();
        doCreateEmptyTable(tableName, ORC, columns.build());
        testUpdateTableStatistics(tableName, EMPTY_TABLE_STATISTICS, partitionStatistics);
    } finally {
        dropTable(tableName);
    }
}
Also used : ColumnMetadata(io.trino.spi.connector.ColumnMetadata) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) SchemaTableName(io.trino.spi.connector.SchemaTableName) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.testng.annotations.Test)

Example 9 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class TestMetastoreHiveStatisticsProvider method testGetTableStatistics.

@Test
public void testGetTableStatistics() {
    String partitionName = "p1=string1/p2=1234";
    PartitionStatistics statistics = PartitionStatistics.builder().setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())).setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))).build();
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, table, hivePartitions) -> ImmutableMap.of(partitionName, statistics));
    HiveColumnHandle columnHandle = createBaseColumn(COLUMN, 2, HIVE_LONG, BIGINT, REGULAR, Optional.empty());
    TableStatistics expected = TableStatistics.builder().setRowCount(Estimate.of(1000)).setColumnStatistics(PARTITION_COLUMN_1, ColumnStatistics.builder().setDataSize(Estimate.of(7000)).setNullsFraction(Estimate.of(0)).setDistinctValuesCount(Estimate.of(1)).build()).setColumnStatistics(PARTITION_COLUMN_2, ColumnStatistics.builder().setRange(new DoubleRange(1234, 1234)).setNullsFraction(Estimate.of(0)).setDistinctValuesCount(Estimate.of(1)).build()).setColumnStatistics(columnHandle, ColumnStatistics.builder().setRange(new DoubleRange(-100, 100)).setNullsFraction(Estimate.of(0.5)).setDistinctValuesCount(Estimate.of(300)).build()).build();
    assertEquals(statisticsProvider.getTableStatistics(SESSION, TABLE, ImmutableMap.of("p1", PARTITION_COLUMN_1, "p2", PARTITION_COLUMN_2, COLUMN, columnHandle), ImmutableMap.of("p1", VARCHAR, "p2", BIGINT, COLUMN, BIGINT), ImmutableList.of(partition(partitionName))), expected);
}
Also used : DoubleRange(io.trino.spi.statistics.DoubleRange) MetastoreHiveStatisticsProvider.validatePartitionStatistics(io.trino.plugin.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) TableStatistics(io.trino.spi.statistics.TableStatistics) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) Test(org.testng.annotations.Test)

Example 10 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class TestMetastoreHiveStatisticsProvider method testGetTableStatisticsValidationFailure.

@Test
public void testGetTableStatisticsValidationFailure() {
    PartitionStatistics corruptedStatistics = PartitionStatistics.builder().setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0)).build();
    String partitionName = "p1=string1/p2=1234";
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, table, hivePartitions) -> ImmutableMap.of(partitionName, corruptedStatistics));
    assertThatThrownBy(() -> statisticsProvider.getTableStatistics(getHiveSession(new HiveConfig().setIgnoreCorruptedStatistics(false)), TABLE, ImmutableMap.of(), ImmutableMap.of(), ImmutableList.of(partition(partitionName)))).isInstanceOf(TrinoException.class).hasFieldOrPropertyWithValue("errorCode", HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode());
    assertEquals(statisticsProvider.getTableStatistics(getHiveSession(new HiveConfig().setIgnoreCorruptedStatistics(true)), TABLE, ImmutableMap.of(), ImmutableMap.of(), ImmutableList.of(partition(partitionName))), TableStatistics.empty());
}
Also used : MetastoreHiveStatisticsProvider.validatePartitionStatistics(io.trino.plugin.hive.statistics.MetastoreHiveStatisticsProvider.validatePartitionStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) TrinoException(io.trino.spi.TrinoException) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) HiveConfig(io.trino.plugin.hive.HiveConfig) Test(org.testng.annotations.Test)

Aggregations

PartitionStatistics (io.trino.plugin.hive.PartitionStatistics)36 SchemaTableName (io.trino.spi.connector.SchemaTableName)21 HiveBasicStatistics (io.trino.plugin.hive.HiveBasicStatistics)16 HiveColumnStatistics (io.trino.plugin.hive.metastore.HiveColumnStatistics)16 TrinoException (io.trino.spi.TrinoException)15 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)13 ImmutableMap (com.google.common.collect.ImmutableMap)13 List (java.util.List)12 Map (java.util.Map)11 OptionalLong (java.util.OptionalLong)11 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)10 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)10 TableNotFoundException (io.trino.spi.connector.TableNotFoundException)10 Type (io.trino.spi.type.Type)10 ArrayList (java.util.ArrayList)10 Objects.requireNonNull (java.util.Objects.requireNonNull)10 Optional (java.util.Optional)10 Set (java.util.Set)10 ImmutableList (com.google.common.collect.ImmutableList)9 PartitionNotFoundException (io.trino.plugin.hive.PartitionNotFoundException)8