use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class ThriftHiveMetastore method getPartitionStatistics.
@Override
public Map<String, PartitionStatistics> getPartitionStatistics(HiveIdentity identity, Table table, List<Partition> partitions) {
List<String> dataColumns = table.getSd().getCols().stream().map(FieldSchema::getName).collect(toImmutableList());
List<String> partitionColumns = table.getPartitionKeys().stream().map(FieldSchema::getName).collect(toImmutableList());
Map<String, HiveBasicStatistics> partitionBasicStatistics = partitions.stream().collect(toImmutableMap(partition -> makePartName(partitionColumns, partition.getValues()), partition -> getHiveBasicStatistics(partition.getParameters())));
Map<String, OptionalLong> partitionRowCounts = partitionBasicStatistics.entrySet().stream().collect(toImmutableMap(Map.Entry::getKey, entry -> entry.getValue().getRowCount()));
Map<String, Map<String, HiveColumnStatistics>> partitionColumnStatistics = getPartitionColumnStatistics(identity, table.getDbName(), table.getTableName(), partitionBasicStatistics.keySet(), dataColumns, partitionRowCounts);
ImmutableMap.Builder<String, PartitionStatistics> result = ImmutableMap.builder();
for (String partitionName : partitionBasicStatistics.keySet()) {
HiveBasicStatistics basicStatistics = partitionBasicStatistics.get(partitionName);
Map<String, HiveColumnStatistics> columnStatistics = partitionColumnStatistics.getOrDefault(partitionName, ImmutableMap.of());
result.put(partitionName, new PartitionStatistics(basicStatistics, columnStatistics));
}
return result.buildOrThrow();
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class ThriftHiveMetastore method updatePartitionStatistics.
@Override
public void updatePartitionStatistics(HiveIdentity identity, Table table, String partitionName, Function<PartitionStatistics, PartitionStatistics> update) {
List<Partition> partitions = getPartitionsByNames(identity, table.getDbName(), table.getTableName(), ImmutableList.of(partitionName));
if (partitions.size() != 1) {
throw new TrinoException(HIVE_METASTORE_ERROR, "Metastore returned multiple partitions for name: " + partitionName);
}
Partition originalPartition = getOnlyElement(partitions);
PartitionStatistics currentStatistics = requireNonNull(getPartitionStatistics(identity, table, partitions).get(partitionName), "getPartitionStatistics() did not return statistics for partition");
PartitionStatistics updatedStatistics = update.apply(currentStatistics);
Partition modifiedPartition = originalPartition.deepCopy();
HiveBasicStatistics basicStatistics = updatedStatistics.getBasicStatistics();
modifiedPartition.setParameters(updateStatisticsParameters(modifiedPartition.getParameters(), basicStatistics));
alterPartitionWithoutStatistics(identity, table.getDbName(), table.getTableName(), modifiedPartition);
Map<String, HiveType> columns = modifiedPartition.getSd().getCols().stream().collect(toImmutableMap(FieldSchema::getName, schema -> HiveType.valueOf(schema.getType())));
setPartitionColumnStatistics(identity, table.getDbName(), table.getTableName(), partitionName, columns, updatedStatistics.getColumnStatistics(), basicStatistics.getRowCount());
Set<String> removedStatistics = difference(currentStatistics.getColumnStatistics().keySet(), updatedStatistics.getColumnStatistics().keySet());
removedStatistics.forEach(column -> deletePartitionColumnStatistics(identity, table.getDbName(), table.getTableName(), partitionName, column));
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method validatePartitionStatistics.
@VisibleForTesting
static void validatePartitionStatistics(SchemaTableName table, Map<String, PartitionStatistics> partitionStatistics) {
partitionStatistics.forEach((partition, statistics) -> {
HiveBasicStatistics basicStatistics = statistics.getBasicStatistics();
OptionalLong rowCount = basicStatistics.getRowCount();
rowCount.ifPresent(count -> checkStatistics(count >= 0, table, partition, "rowCount must be greater than or equal to zero: %s", count));
basicStatistics.getFileCount().ifPresent(count -> checkStatistics(count >= 0, table, partition, "fileCount must be greater than or equal to zero: %s", count));
basicStatistics.getInMemoryDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition, "inMemoryDataSizeInBytes must be greater than or equal to zero: %s", size));
basicStatistics.getOnDiskDataSizeInBytes().ifPresent(size -> checkStatistics(size >= 0, table, partition, "onDiskDataSizeInBytes must be greater than or equal to zero: %s", size));
statistics.getColumnStatistics().forEach((column, columnStatistics) -> validateColumnStatistics(table, partition, column, rowCount, columnStatistics));
});
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class TestMetastoreHiveStatisticsProvider method testGetTableStatisticsUnpartitioned.
@Test
public void testGetTableStatisticsUnpartitioned() {
PartitionStatistics statistics = PartitionStatistics.builder().setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty())).setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300)))).build();
MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, table, hivePartitions) -> ImmutableMap.of(UNPARTITIONED_ID, statistics));
HiveColumnHandle columnHandle = createBaseColumn(COLUMN, 2, HIVE_LONG, BIGINT, REGULAR, Optional.empty());
TableStatistics expected = TableStatistics.builder().setRowCount(Estimate.of(1000)).setColumnStatistics(columnHandle, ColumnStatistics.builder().setRange(new DoubleRange(-100, 100)).setNullsFraction(Estimate.of(0.5)).setDistinctValuesCount(Estimate.of(300)).build()).build();
assertEquals(statisticsProvider.getTableStatistics(SESSION, TABLE, ImmutableMap.of(COLUMN, columnHandle), ImmutableMap.of(COLUMN, BIGINT), ImmutableList.of(new HivePartition(TABLE))), expected);
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class SemiTransactionalHiveMetastore method updatePartitionStatistics.
// For HiveBasicStatistics, we only overwrite the original statistics if the new one is not empty.
// For HiveColumnStatistics, only overwrite the original statistics for columns present in the new ones and preserve the others.
private static PartitionStatistics updatePartitionStatistics(PartitionStatistics oldPartitionStats, PartitionStatistics newPartitionStats) {
HiveBasicStatistics oldBasicStatistics = oldPartitionStats.getBasicStatistics();
HiveBasicStatistics newBasicStatistics = newPartitionStats.getBasicStatistics();
HiveBasicStatistics updatedBasicStatistics = new HiveBasicStatistics(firstPresent(newBasicStatistics.getFileCount(), oldBasicStatistics.getFileCount()), firstPresent(newBasicStatistics.getRowCount(), oldBasicStatistics.getRowCount()), firstPresent(newBasicStatistics.getInMemoryDataSizeInBytes(), oldBasicStatistics.getInMemoryDataSizeInBytes()), firstPresent(newBasicStatistics.getOnDiskDataSizeInBytes(), oldBasicStatistics.getOnDiskDataSizeInBytes()));
Map<String, HiveColumnStatistics> updatedColumnStatistics = updateColumnStatistics(oldPartitionStats.getColumnStatistics(), newPartitionStats.getColumnStatistics());
return new PartitionStatistics(updatedBasicStatistics, updatedColumnStatistics);
}
Aggregations