Use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.
From the class SemiTransactionalHiveMetastore, method setTableStatistics.
// TODO: Allow updating statistics for 2 tables in the same transaction
public synchronized void setTableStatistics(Table table, PartitionStatistics tableStatistics)
{
    AcidTransaction transaction = currentHiveTransaction.isPresent()
            ? currentHiveTransaction.get().getTransaction()
            : NO_ACID_TRANSACTION;
    setExclusive((delegate, hdfsEnvironment) -> delegate.updateTableStatistics(
            table.getDatabaseName(),
            table.getTableName(),
            transaction,
            statistics -> updatePartitionStatistics(statistics, tableStatistics)));
}
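The caller supplies only the new statistics; the metastore delegate receives a Function<PartitionStatistics, PartitionStatistics> that replaces whatever is currently stored. A minimal stand-alone sketch of that update-function pattern, using simplified stand-in types rather than Trino's real classes:

import java.util.Map;
import java.util.function.Function;

// Simplified stand-in for Trino's statistics types; the names here are illustrative only.
record StatsSketch(long rowCount, Map<String, Long> columnSizesInBytes) {}

class StatisticsUpdateSketch
{
    // The metastore receives an update function rather than raw values, so it can read the
    // current statistics, apply the function, and persist the result inside its own
    // locking or transaction scope.
    static void updateTableStatistics(Function<StatsSketch, StatsSketch> update)
    {
        StatsSketch current = new StatsSketch(0, Map.of());   // stands in for the stored statistics
        StatsSketch updated = update.apply(current);          // caller-supplied transformation
        System.out.println("persisting " + updated);          // stands in for the write-back
    }

    public static void main(String[] args)
    {
        StatsSketch fresh = new StatsSketch(42, Map.of("c1", 1024L));
        // Mirrors setTableStatistics above: discard the current value and install the new one.
        updateTableStatistics(current -> fresh);
    }
}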
Use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.
From the class GlueHiveMetastore, method updateTableStatistics.
@Override
public void updateTableStatistics(String databaseName, String tableName, AcidTransaction transaction, Function<PartitionStatistics, PartitionStatistics> update)
{
    Table table = getExistingTable(databaseName, tableName);
    if (transaction.isAcidTransactionRunning()) {
        table = Table.builder(table).setWriteId(OptionalLong.of(transaction.getWriteId())).build();
    }
    PartitionStatistics currentStatistics = getTableStatistics(table);
    PartitionStatistics updatedStatistics = update.apply(currentStatistics);
    try {
        TableInput tableInput = GlueInputConverter.convertTable(table);
        final Map<String, String> statisticsParameters = updateStatisticsParameters(table.getParameters(), updatedStatistics.getBasicStatistics());
        tableInput.setParameters(statisticsParameters);
        table = Table.builder(table).setParameters(statisticsParameters).build();
        stats.getUpdateTable().call(() -> glueClient.updateTable(new UpdateTableRequest()
                .withCatalogId(catalogId)
                .withDatabaseName(databaseName)
                .withTableInput(tableInput)));
        columnStatisticsProvider.updateTableColumnStatistics(table, updatedStatistics.getColumnStatistics());
    }
    catch (EntityNotFoundException e) {
        throw new TableNotFoundException(new SchemaTableName(databaseName, tableName));
    }
    catch (AmazonServiceException e) {
        throw new TrinoException(HIVE_METASTORE_ERROR, e);
    }
}
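In the Glue path, the basic statistics are folded into the table's string-keyed parameter map (updateStatisticsParameters) before updateTable is called, while column statistics go through a separate provider. A minimal sketch of that parameter-folding idea, assuming hypothetical key names rather than the exact keys Trino writes:

import java.util.HashMap;
import java.util.Map;
import java.util.OptionalLong;

class StatisticsParametersSketch
{
    // Fold row count and total size into the parameter map; the key names are assumptions
    // made for this sketch, not necessarily what the real updateStatisticsParameters writes.
    static Map<String, String> withBasicStatistics(Map<String, String> parameters, OptionalLong rowCount, OptionalLong totalSizeInBytes)
    {
        Map<String, String> updated = new HashMap<>(parameters);
        rowCount.ifPresent(count -> updated.put("numRows", Long.toString(count)));
        totalSizeInBytes.ifPresent(size -> updated.put("totalSize", Long.toString(size)));
        return updated;
    }

    public static void main(String[] args)
    {
        Map<String, String> parameters = Map.of("EXTERNAL", "TRUE");
        System.out.println(withBasicStatistics(parameters, OptionalLong.of(1_000), OptionalLong.of(65_536)));
    }
}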
Use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.
From the class FileHiveMetastore, method getTableStatistics.
private synchronized PartitionStatistics getTableStatistics(String databaseName, String tableName)
{
    Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName);
    TableMetadata tableMetadata = readSchemaFile(TABLE, tableMetadataDirectory, tableCodec)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));
    checkVersion(tableMetadata.getWriterVersion());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(tableMetadata.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = tableMetadata.getColumnStatistics();
    return new PartitionStatistics(basicStatistics, columnStatistics);
}
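The file-backed implementation recovers the basic statistics from the table's parameter map via getHiveBasicStatistics, the read side of the parameter encoding sketched above. A small sketch of that reverse direction, again with assumed key names:

import java.util.Map;
import java.util.OptionalLong;

class BasicStatisticsParsingSketch
{
    // Parse a counter back out of the parameter map; absent or malformed values become empty.
    // The key names mirror the ones assumed in the previous sketch and are not authoritative.
    static OptionalLong parseLongParameter(Map<String, String> parameters, String key)
    {
        String value = parameters.get(key);
        if (value == null) {
            return OptionalLong.empty();
        }
        try {
            return OptionalLong.of(Long.parseLong(value));
        }
        catch (NumberFormatException e) {
            return OptionalLong.empty();
        }
    }

    public static void main(String[] args)
    {
        Map<String, String> parameters = Map.of("numRows", "1000", "totalSize", "65536");
        System.out.println(parseLongParameter(parameters, "numRows"));     // OptionalLong[1000]
        System.out.println(parseLongParameter(parameters, "rawDataSize")); // OptionalLong.empty
    }
}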
Use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.
From the class ThriftHiveMetastore, method getTableStatistics.
@Override
public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table)
{
    List<String> dataColumns = table.getSd().getCols().stream()
            .map(FieldSchema::getName)
            .collect(toImmutableList());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(table.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = getTableColumnStatistics(
            identity, table.getDbName(), table.getTableName(), dataColumns, basicStatistics.getRowCount());
    return new PartitionStatistics(basicStatistics, columnStatistics);
}
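Both the file- and thrift-backed implementations end the same way: table-level counters and per-column statistics are combined into a single PartitionStatistics value. A stand-alone sketch of that shape, using simplified stand-in records instead of Trino's real classes:

import java.util.Map;
import java.util.OptionalLong;

// Stand-in types for illustration only; Trino's real classes carry more fields.
record BasicStatsStandIn(OptionalLong rowCount, OptionalLong totalSizeInBytes) {}
record ColumnStatsStandIn(OptionalLong distinctValuesCount, OptionalLong nullsCount) {}
record PartitionStatsStandIn(BasicStatsStandIn basic, Map<String, ColumnStatsStandIn> columns) {}

class CombineStatisticsSketch
{
    public static void main(String[] args)
    {
        BasicStatsStandIn basic = new BasicStatsStandIn(OptionalLong.of(1_000), OptionalLong.of(64_000));
        Map<String, ColumnStatsStandIn> columns = Map.of(
                "id", new ColumnStatsStandIn(OptionalLong.of(1_000), OptionalLong.of(0)));
        System.out.println(new PartitionStatsStandIn(basic, columns));
    }
}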
Use of io.trino.plugin.hive.PartitionStatistics in project trino by trinodb.
From the class MetastoreHiveStatisticsProvider, method calculateDataSize.
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount)
{
    List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream()
            .filter(statistics -> {
                if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
                    return false;
                }
                HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
                if (columnStatistics == null) {
                    return false;
                }
                return columnStatistics.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
        return Estimate.unknown();
    }
    long knownRowCount = 0;
    long knownDataSize = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
        knownRowCount += rowCount;
        knownDataSize += dataSize;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (knownRowCount == 0) {
        return Estimate.unknown();
    }
    double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
    return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
}
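The estimate extrapolates an average per-row size from the partitions that have both a row count and a data size for the column, then scales it to the total row count. A worked example with made-up numbers:

class DataSizeEstimateExample
{
    public static void main(String[] args)
    {
        // Two partitions have known statistics: (rowCount, totalSizeInBytes) = (100, 5_000) and (300, 19_000).
        long knownRowCount = 100 + 300;           // 400
        long knownDataSize = 5_000 + 19_000;      // 24_000
        double totalRowCount = 1_000;             // rows across all partitions, known or not
        double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount; // 60.0 bytes per row
        System.out.println(averageValueDataSizeInBytes * totalRowCount); // 60000.0
    }
}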