Search in sources :

Example 1 with HiveBasicStatistics

use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.

the class FileHiveMetastore method getTableStatistics.

/**
 * Reads the statistics recorded for a table from its metadata directory.
 *
 * <p>The basic statistics are derived from the table parameters, and the column
 * statistics are taken as stored in the table metadata.
 *
 * @param databaseName name of the database containing the table
 * @param tableName name of the table
 * @return the basic and column statistics of the table
 * @throws TableNotFoundException if no table schema file can be read for the given names
 */
private synchronized PartitionStatistics getTableStatistics(String databaseName, String tableName) {
    // The SchemaTableName is only built when the schema file is missing.
    TableMetadata metadata = readSchemaFile(TABLE, getTableMetadataDirectory(databaseName, tableName), tableCodec)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));

    // Validate the writer version before using anything else from the metadata.
    checkVersion(metadata.getWriterVersion());

    return new PartitionStatistics(
            getHiveBasicStatistics(metadata.getParameters()),
            metadata.getColumnStatistics());
}
Also used : Path(org.apache.hadoop.fs.Path) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) ThriftMetastoreUtil.getHiveBasicStatistics(io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) SchemaTableName(io.trino.spi.connector.SchemaTableName)

Example 2 with HiveBasicStatistics

use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.

the class ThriftMetastoreUtil method getHiveBasicStatistics.

/**
 * Builds {@link HiveBasicStatistics} from a parameter map.
 *
 * <p>Each statistic is looked up under its own key and passed through {@code parse},
 * in this order: file count, row count, in-memory data size in bytes, on-disk data
 * size in bytes.
 *
 * @param parameters parameter map to read the statistics from
 * @return the basic statistics assembled from the four parsed values
 */
public static HiveBasicStatistics getHiveBasicStatistics(Map<String, String> parameters) {
    return new HiveBasicStatistics(
            // number of files
            parse(parameters.get(NUM_FILES)),
            // number of rows
            parse(parameters.get(NUM_ROWS)),
            // in-memory data size in bytes
            parse(parameters.get(RAW_DATA_SIZE)),
            // on-disk data size in bytes
            parse(parameters.get(TOTAL_SIZE)));
}
Also used : OptionalLong(java.util.OptionalLong) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics)

Example 3 with HiveBasicStatistics

use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.

the class ThriftHiveMetastore method getTableStatistics.

/**
 * Returns the statistics of a table: basic statistics derived from the table
 * parameters plus column statistics for each column of its storage descriptor.
 *
 * @param identity identity used for the column statistics lookup
 * @param table table whose statistics are requested
 * @return the basic and column statistics of the table
 */
@Override
public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) {
    // Names of the columns listed in the table's storage descriptor.
    List<String> columnNames = table.getSd().getCols().stream()
            .map(FieldSchema::getName)
            .collect(toImmutableList());

    HiveBasicStatistics basic = getHiveBasicStatistics(table.getParameters());

    // The row count from the basic statistics is handed to the column statistics lookup.
    return new PartitionStatistics(
            basic,
            getTableColumnStatistics(identity, table.getDbName(), table.getTableName(), columnNames, basic.getRowCount()));
}
Also used : PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) ThriftMetastoreUtil.getHiveBasicStatistics(io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics)

Example 4 with HiveBasicStatistics

use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculatePartitionsRowCount.

/**
 * Estimates the average row count per partition and the total row count of the
 * queried partitions from a sample of partition statistics.
 *
 * <p>Only partitions whose row count is present contribute. When at most two row
 * counts are known, or the sample size equals {@code queriedPartitionsCount}, the
 * plain average is used and the total is that average multiplied by
 * {@code queriedPartitionsCount}. Otherwise the smallest and largest row counts are
 * left out of the average, and added back unscaled when computing the total.
 *
 * @param statistics statistics of the sampled partitions
 * @param queriedPartitionsCount number of partitions the estimate should cover
 * @return the estimate, or empty when no sampled partition has a row count
 */
@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount) {
    // Collect the row counts that are present; a negative value fails verification.
    long[] knownRowCounts = statistics.stream()
            .map(PartitionStatistics::getBasicStatistics)
            .map(HiveBasicStatistics::getRowCount)
            .filter(OptionalLong::isPresent)
            .mapToLong(OptionalLong::getAsLong)
            .peek(count -> verify(count >= 0, "count must be greater than or equal to zero"))
            .toArray();

    // Sample contains all the queried partitions, estimate avg normally
    boolean sampleCoversAllPartitions = queriedPartitionsCount == statistics.size();
    if (knownRowCounts.length <= 2 || sampleCoversAllPartitions) {
        OptionalDouble plainAverage = Arrays.stream(knownRowCounts).average();
        if (!plainAverage.isPresent()) {
            return Optional.empty();
        }
        double rowsPerPartition = plainAverage.getAsDouble();
        return Optional.of(new PartitionsRowCount(rowsPerPartition, rowsPerPartition * queriedPartitionsCount));
    }

    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Leaving the smallest and largest row counts out of the average reduces the chance that
    // a couple of outliers distort the extrapolated row count.
    long smallest = knownRowCounts[0];
    long largest = knownRowCounts[0];
    long total = 0;
    for (long rowCount : knownRowCounts) {
        smallest = Math.min(smallest, rowCount);
        largest = Math.max(largest, rowCount);
        total += rowCount;
    }

    double trimmedAverage = ((double) (total - smallest - largest)) / (knownRowCounts.length - 2);
    // The two excluded partitions are added back with their actual row counts.
    double estimatedRowCount = (trimmedAverage * (queriedPartitionsCount - 2)) + smallest + largest;
    return Optional.of(new PartitionsRowCount(trimmedAverage, estimatedRowCount));
}
Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) 
HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) 
TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) OptionalLong(java.util.OptionalLong) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) OptionalDouble(java.util.OptionalDouble) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 5 with HiveBasicStatistics

use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.

the class TestStatistics method testReduce.

/**
 * Checks {@code reduce} on basic statistics for both ADD and SUBTRACT: any
 * combination involving empty statistics is expected to give empty statistics, and
 * fully populated statistics are expected to be combined field by field.
 */
@Test
public void testReduce() {
    // ADD: an empty operand on either side gives an empty result.
    assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), ADD))
            .isEqualTo(createEmptyStatistics());
    assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), ADD))
            .isEqualTo(createEmptyStatistics());
    assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), ADD))
            .isEqualTo(createEmptyStatistics());

    // SUBTRACT: an empty operand on either side gives an empty result.
    assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), SUBTRACT))
            .isEqualTo(createEmptyStatistics());
    assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), SUBTRACT))
            .isEqualTo(createEmptyStatistics());
    assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), SUBTRACT))
            .isEqualTo(createEmptyStatistics());

    // Fully populated operands are combined field by field.
    HiveBasicStatistics left = new HiveBasicStatistics(11, 9, 7, 5);
    HiveBasicStatistics right = new HiveBasicStatistics(1, 2, 3, 4);
    assertThat(reduce(left, right, ADD))
            .isEqualTo(new HiveBasicStatistics(12, 11, 10, 9));

    HiveBasicStatistics minuend = new HiveBasicStatistics(11, 9, 7, 5);
    HiveBasicStatistics subtrahend = new HiveBasicStatistics(1, 2, 3, 4);
    assertThat(reduce(minuend, subtrahend, SUBTRACT))
            .isEqualTo(new HiveBasicStatistics(10, 7, 4, 1));
}
Also used : HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) Test(org.testng.annotations.Test)

Aggregations

HiveBasicStatistics (io.trino.plugin.hive.HiveBasicStatistics)22 PartitionStatistics (io.trino.plugin.hive.PartitionStatistics)13 ThriftMetastoreUtil.getHiveBasicStatistics (io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics)13 HiveColumnStatistics (io.trino.plugin.hive.metastore.HiveColumnStatistics)11 TrinoException (io.trino.spi.TrinoException)11 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)9 ImmutableMap (com.google.common.collect.ImmutableMap)9 Column (io.trino.plugin.hive.metastore.Column)8 List (java.util.List)8 Map (java.util.Map)8 ImmutableList (com.google.common.collect.ImmutableList)7 OptionalLong (java.util.OptionalLong)7 HIVE_METASTORE_ERROR (io.trino.plugin.hive.HiveErrorCode.HIVE_METASTORE_ERROR)6 ColumnStatisticType (io.trino.spi.statistics.ColumnStatisticType)6 Type (io.trino.spi.type.Type)6 ArrayList (java.util.ArrayList)6 Optional (java.util.Optional)6 Set (java.util.Set)6 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)5 Sets.difference (com.google.common.collect.Sets.difference)5