Examples with PartitionStatistics - io.trino.plugin.hive.PartitionStatistics

Example 1 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class SemiTransactionalHiveMetastore method setTableStatistics.

// TODO: Allow updating statistics for 2 tables in the same transaction
public synchronized void setTableStatistics(Table table, PartitionStatistics tableStatistics) {
    AcidTransaction transaction = currentHiveTransaction.isPresent() ? currentHiveTransaction.get().getTransaction() : NO_ACID_TRANSACTION;
    setExclusive((delegate, hdfsEnvironment) -> delegate.updateTableStatistics(table.getDatabaseName(), table.getTableName(), transaction, statistics -> updatePartitionStatistics(statistics, tableStatistics)));
}

Also used : TableInvalidationCallback(io.trino.plugin.hive.TableInvalidationCallback) HiveUpdateProcessor(io.trino.plugin.hive.HiveUpdateProcessor) FileSystem(org.apache.hadoop.fs.FileSystem) USER(io.trino.spi.security.PrincipalType.USER) MetastoreConf.getTimeVar(org.apache.hadoop.hive.metastore.conf.MetastoreConf.getTimeVar) NO_ACID_TRANSACTION(io.trino.plugin.hive.acid.AcidTransaction.NO_ACID_TRANSACTION) FileStatus(org.apache.hadoop.fs.FileStatus) ColumnStatisticType(io.trino.spi.statistics.ColumnStatisticType) NOT_SUPPORTED(io.trino.spi.StandardErrorCode.NOT_SUPPORTED) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) PRESTO_QUERY_ID_NAME(io.trino.plugin.hive.HiveMetadata.PRESTO_QUERY_ID_NAME) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) GuardedBy(javax.annotation.concurrent.GuardedBy) HiveWriteUtils.pathExists(io.trino.plugin.hive.util.HiveWriteUtils.pathExists) MANAGED_TABLE(org.apache.hadoop.hive.metastore.TableType.MANAGED_TABLE) SchemaTableName(io.trino.spi.connector.SchemaTableName) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) OWNERSHIP(io.trino.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege.OWNERSHIP) SqlStandardAccessControlMetadataMetastore(io.trino.plugin.hive.security.SqlStandardAccessControlMetadataMetastore) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Joiner(com.google.common.base.Joiner) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) HIVE_TABLE_DROPPED_DURING_QUERY(io.trino.plugin.hive.HiveErrorCode.HIVE_TABLE_DROPPED_DURING_QUERY) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Iterables(com.google.common.collect.Iterables) PartitionNotFoundException(io.trino.plugin.hive.PartitionNotFoundException) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) DataOperationType(org.apache.hadoop.hive.metastore.api.DataOperationType) HiveType(io.trino.plugin.hive.HiveType) OptionalLong(java.util.OptionalLong) Lists(com.google.common.collect.Lists) HiveTableHandle(io.trino.plugin.hive.HiveTableHandle) FormatMethod(com.google.errorprone.annotations.FormatMethod) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) LinkedHashSet(java.util.LinkedHashSet) PartitionAndStatementId(io.trino.plugin.hive.PartitionAndStatementId) ViewReaderUtil.isPrestoView(io.trino.plugin.hive.ViewReaderUtil.isPrestoView) Executor(java.util.concurrent.Executor) FileUtils.makePartName(org.apache.hadoop.hive.common.FileUtils.makePartName) PrincipalType(io.trino.spi.security.PrincipalType) TRANSACTION_CONFLICT(io.trino.spi.StandardErrorCode.TRANSACTION_CONFLICT) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) RoleGrant(io.trino.spi.security.RoleGrant) HiveMetastoreClosure(io.trino.plugin.hive.HiveMetastoreClosure) ValidTxnWriteIdList(org.apache.hadoop.hive.common.ValidTxnWriteIdList) HivePrivilege(io.trino.plugin.hive.metastore.HivePrivilegeInfo.HivePrivilege) NUM_ROWS(io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.NUM_ROWS) ScheduledFuture(java.util.concurrent.ScheduledFuture) HiveUtil.toPartitionValues(io.trino.plugin.hive.util.HiveUtil.toPartitionValues) AcidOperation(io.trino.plugin.hive.acid.AcidOperation) SchemaNotFoundException(io.trino.spi.connector.SchemaNotFoundException) Duration(io.airlift.units.Duration) Statistics.merge(io.trino.plugin.hive.util.Statistics.merge) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) ALREADY_EXISTS(io.trino.spi.StandardErrorCode.ALREADY_EXISTS) Path(org.apache.hadoop.fs.Path) HIVE_FILESYSTEM_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_FILESYSTEM_ERROR) Statistics.reduce(io.trino.plugin.hive.util.Statistics.reduce) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) Collection(java.util.Collection) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) TableAlreadyExistsException(io.trino.plugin.hive.TableAlreadyExistsException) TrinoException(io.trino.spi.TrinoException) HIVE_PATH_ALREADY_EXISTS(io.trino.plugin.hive.HiveErrorCode.HIVE_PATH_ALREADY_EXISTS) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) HdfsContext(io.trino.plugin.hive.HdfsEnvironment.HdfsContext) List(java.util.List) MetastoreUtil.buildInitialPrivilegeSet(io.trino.plugin.hive.metastore.MetastoreUtil.buildInitialPrivilegeSet) Optional(java.util.Optional) Pattern(java.util.regex.Pattern) WriteMode(io.trino.plugin.hive.LocationHandle.WriteMode) AcidUtils(org.apache.hadoop.hive.ql.io.AcidUtils) DIRECT_TO_TARGET_NEW_DIRECTORY(io.trino.plugin.hive.LocationHandle.WriteMode.DIRECT_TO_TARGET_NEW_DIRECTORY) Logger(io.airlift.log.Logger) Type(io.trino.spi.type.Type) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) Function(java.util.function.Function) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) HIVE_METASTORE_ERROR(io.trino.plugin.hive.HiveErrorCode.HIVE_METASTORE_ERROR) Objects.requireNonNull(java.util.Objects.requireNonNull) LinkedList(java.util.LinkedList) Iterator(java.util.Iterator) HiveWriteUtils.isFileCreatedByQuery(io.trino.plugin.hive.util.HiveWriteUtils.isFileCreatedByQuery) TupleDomain(io.trino.spi.predicate.TupleDomain) HiveWriteUtils.createDirectory(io.trino.plugin.hive.util.HiveWriteUtils.createDirectory) SUBTRACT(io.trino.plugin.hive.util.Statistics.ReduceOperator.SUBTRACT) TXN_TIMEOUT(org.apache.hadoop.hive.metastore.conf.MetastoreConf.ConfVars.TXN_TIMEOUT) VisibleForTesting(com.google.common.annotations.VisibleForTesting) AcidTransaction(io.trino.plugin.hive.acid.AcidTransaction)

Example 2 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class GlueHiveMetastore method updateTableStatistics.

@Override
public void updateTableStatistics(String databaseName, String tableName, AcidTransaction transaction, Function<PartitionStatistics, PartitionStatistics> update) {
    Table table = getExistingTable(databaseName, tableName);
    if (transaction.isAcidTransactionRunning()) {
        table = Table.builder(table).setWriteId(OptionalLong.of(transaction.getWriteId())).build();
    }
    PartitionStatistics currentStatistics = getTableStatistics(table);
    PartitionStatistics updatedStatistics = update.apply(currentStatistics);
    try {
        TableInput tableInput = GlueInputConverter.convertTable(table);
        final Map<String, String> statisticsParameters = updateStatisticsParameters(table.getParameters(), updatedStatistics.getBasicStatistics());
        tableInput.setParameters(statisticsParameters);
        table = Table.builder(table).setParameters(statisticsParameters).build();
        stats.getUpdateTable().call(() -> glueClient.updateTable(new UpdateTableRequest().withCatalogId(catalogId).withDatabaseName(databaseName).withTableInput(tableInput)));
        columnStatisticsProvider.updateTableColumnStatistics(table, updatedStatistics.getColumnStatistics());
    } catch (EntityNotFoundException e) {
        throw new TableNotFoundException(new SchemaTableName(databaseName, tableName));
    } catch (AmazonServiceException e) {
        throw new TrinoException(HIVE_METASTORE_ERROR, e);
    }
}

Also used : TableInput(com.amazonaws.services.glue.model.TableInput) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) Table(io.trino.plugin.hive.metastore.Table) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) UpdateTableRequest(com.amazonaws.services.glue.model.UpdateTableRequest) AmazonServiceException(com.amazonaws.AmazonServiceException) TrinoException(io.trino.spi.TrinoException) EntityNotFoundException(com.amazonaws.services.glue.model.EntityNotFoundException) SchemaTableName(io.trino.spi.connector.SchemaTableName)

Example 3 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class FileHiveMetastore method getTableStatistics.

private synchronized PartitionStatistics getTableStatistics(String databaseName, String tableName) {
    Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName);
    TableMetadata tableMetadata = readSchemaFile(TABLE, tableMetadataDirectory, tableCodec).orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));
    checkVersion(tableMetadata.getWriterVersion());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(tableMetadata.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = tableMetadata.getColumnStatistics();
    return new PartitionStatistics(basicStatistics, columnStatistics);
}

Also used : Path(org.apache.hadoop.fs.Path) TableNotFoundException(io.trino.spi.connector.TableNotFoundException) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) ThriftMetastoreUtil.getHiveBasicStatistics(io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) SchemaTableName(io.trino.spi.connector.SchemaTableName)

Example 4 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class ThriftHiveMetastore method getTableStatistics.

@Override
public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) {
    List<String> dataColumns = table.getSd().getCols().stream().map(FieldSchema::getName).collect(toImmutableList());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(table.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = getTableColumnStatistics(identity, table.getDbName(), table.getTableName(), dataColumns, basicStatistics.getRowCount());
    return new PartitionStatistics(basicStatistics, columnStatistics);
}

Also used : PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) ThriftMetastoreUtil.getHiveBasicStatistics(io.trino.plugin.hive.metastore.thrift.ThriftMetastoreUtil.getHiveBasicStatistics) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics)

Example 5 with PartitionStatistics

use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.

the class MetastoreHiveStatisticsProvider method calculateDataSize.

@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream().filter(statistics -> {
        if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
            return false;
        }
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        if (columnStatistics == null) {
            return false;
        }
        return columnStatistics.getTotalSizeInBytes().isPresent();
    }).collect(toImmutableList());
    if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
        return Estimate.unknown();
    }
    long knownRowCount = 0;
    long knownDataSize = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
        knownRowCount += rowCount;
        knownDataSize += dataSize;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (knownRowCount == 0) {
        return Estimate.unknown();
    }
    double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
    return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
}

Also used : DateStatistics(io.trino.plugin.hive.metastore.DateStatistics) Arrays(java.util.Arrays) Collections.unmodifiableList(java.util.Collections.unmodifiableList) BigDecimal(java.math.BigDecimal) StatsUtil.toStatsRepresentation(io.trino.spi.statistics.StatsUtil.toStatsRepresentation) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Maps.immutableEntry(com.google.common.collect.Maps.immutableEntry) Map(java.util.Map) HIVE_CORRUPTED_COLUMN_STATISTICS(io.trino.plugin.hive.HiveErrorCode.HIVE_CORRUPTED_COLUMN_STATISTICS) HiveColumnHandle(io.trino.plugin.hive.HiveColumnHandle) INTEGER(io.trino.spi.type.IntegerType.INTEGER) SMALLINT(io.trino.spi.type.SmallintType.SMALLINT) HiveBasicStatistics(io.trino.plugin.hive.HiveBasicStatistics) ImmutableMap(com.google.common.collect.ImmutableMap) HivePartition(io.trino.plugin.hive.HivePartition) Collection(java.util.Collection) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) HiveSessionProperties.isStatisticsEnabled(io.trino.plugin.hive.HiveSessionProperties.isStatisticsEnabled) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) DoubleStream(java.util.stream.DoubleStream) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) BIGINT(io.trino.spi.type.BigintType.BIGINT) LocalDate(java.time.LocalDate) Optional(java.util.Optional) HashFunction(com.google.common.hash.HashFunction) DecimalType(io.trino.spi.type.DecimalType) DATE(io.trino.spi.type.DateType.DATE) REAL(io.trino.spi.type.RealType.REAL) MoreObjects.toStringHelper(com.google.common.base.MoreObjects.toStringHelper) DoubleRange(io.trino.spi.statistics.DoubleRange) Verify.verifyNotNull(com.google.common.base.Verify.verifyNotNull) HiveSessionProperties.isIgnoreCorruptedStatistics(io.trino.plugin.hive.HiveSessionProperties.isIgnoreCorruptedStatistics) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) Slice(io.airlift.slice.Slice) Logger(io.airlift.log.Logger) NullableValue(io.trino.spi.predicate.NullableValue) HiveSessionProperties.getPartitionStatisticsSampleSize(io.trino.plugin.hive.HiveSessionProperties.getPartitionStatisticsSampleSize) Type(io.trino.spi.type.Type) OptionalDouble(java.util.OptionalDouble) Shorts(com.google.common.primitives.Shorts) UNPARTITIONED_ID(io.trino.plugin.hive.HivePartition.UNPARTITIONED_ID) ArrayList(java.util.ArrayList) VarcharType(io.trino.spi.type.VarcharType) OptionalLong(java.util.OptionalLong) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) Verify.verify(com.google.common.base.Verify.verify) SemiTransactionalHiveMetastore(io.trino.plugin.hive.metastore.SemiTransactionalHiveMetastore) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) TableStatistics(io.trino.spi.statistics.TableStatistics) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) Double.isFinite(java.lang.Double.isFinite) IntegerStatistics(io.trino.plugin.hive.metastore.IntegerStatistics) Estimate(io.trino.spi.statistics.Estimate) VerifyException(com.google.common.base.VerifyException) ColumnStatistics(io.trino.spi.statistics.ColumnStatistics) SignedBytes(com.google.common.primitives.SignedBytes) DecimalStatistics(io.trino.plugin.hive.metastore.DecimalStatistics) ConnectorSession(io.trino.spi.connector.ConnectorSession) DoubleStatistics(io.trino.plugin.hive.metastore.DoubleStatistics) Hashing.murmur3_128(com.google.common.hash.Hashing.murmur3_128) Ints(com.google.common.primitives.Ints) DOUBLE(io.trino.spi.type.DoubleType.DOUBLE) Double.isNaN(java.lang.Double.isNaN) CharType(io.trino.spi.type.CharType) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TINYINT(io.trino.spi.type.TinyintType.TINYINT) Comparator(java.util.Comparator) PartitionStatistics(io.trino.plugin.hive.PartitionStatistics) VerifyException(com.google.common.base.VerifyException) HiveColumnStatistics(io.trino.plugin.hive.metastore.HiveColumnStatistics) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

PartitionStatistics (io.trino.plugin.hive.PartitionStatistics)36 SchemaTableName (io.trino.spi.connector.SchemaTableName)21 HiveBasicStatistics (io.trino.plugin.hive.HiveBasicStatistics)16 HiveColumnStatistics (io.trino.plugin.hive.metastore.HiveColumnStatistics)16 TrinoException (io.trino.spi.TrinoException)15 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)13 ImmutableMap (com.google.common.collect.ImmutableMap)13 List (java.util.List)12 Map (java.util.Map)11 OptionalLong (java.util.OptionalLong)11 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)10 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)10 TableNotFoundException (io.trino.spi.connector.TableNotFoundException)10 Type (io.trino.spi.type.Type)10 ArrayList (java.util.ArrayList)10 Objects.requireNonNull (java.util.Objects.requireNonNull)10 Optional (java.util.Optional)10 Set (java.util.Set)10 ImmutableList (com.google.common.collect.ImmutableList)9 PartitionNotFoundException (io.trino.plugin.hive.PartitionNotFoundException)8