Use of io.trino.plugin.hive.PartitionStatistics in the Trino project (trinodb).
Source: class MetastoreHiveStatisticsProvider, method calculatePartitionsRowCount.
@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount) {
    // Gather the sampled row counts that are actually present; a negative count means corrupt metadata
    long[] rowCounts = statistics.stream()
            .map(PartitionStatistics::getBasicStatistics)
            .map(HiveBasicStatistics::getRowCount)
            .filter(OptionalLong::isPresent)
            .mapToLong(OptionalLong::getAsLong)
            .peek(count -> verify(count >= 0, "count must be greater than or equal to zero"))
            .toArray();
    int sampleSize = statistics.size();
    // With at most two usable samples, or when the sample already covers every queried partition,
    // extrapolate from the plain average
    if (rowCounts.length <= 2 || queriedPartitionsCount == sampleSize) {
        OptionalDouble average = Arrays.stream(rowCounts).average();
        if (average.isEmpty()) {
            return Optional.empty();
        }
        double averageRowsPerPartition = average.getAsDouble();
        return Optional.of(new PartitionsRowCount(averageRowsPerPartition, averageRowsPerPartition * queriedPartitionsCount));
    }
    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Excluding the smallest and largest sampled counts from the average reduces the chance that
    // a couple of outliers skew the extrapolated total row count.
    LongSummaryStatistics summary = Arrays.stream(rowCounts).summaryStatistics();
    double averageWithoutOutliers = ((double) (summary.getSum() - summary.getMin() - summary.getMax())) / (rowCounts.length - 2);
    // Extrapolate the trimmed average over the unseen partitions, then add the two extremes back in
    double rowCount = (averageWithoutOutliers * (queriedPartitionsCount - 2)) + summary.getMin() + summary.getMax();
    return Optional.of(new PartitionsRowCount(averageWithoutOutliers, rowCount));
}
Use of io.trino.plugin.hive.PartitionStatistics in the Trino project (trinodb).
Source: class TestHiveGlueMetastore, method testInvalidColumnStatisticsMetadata.
@Test
public void testInvalidColumnStatisticsMetadata() throws Exception {
    SchemaTableName tableName = temporaryTable("test_statistics_invalid_column_metadata");
    try {
        List<ColumnMetadata> columns = List.of(new ColumnMetadata("column1", BIGINT));
        Map<String, HiveColumnStatistics> columnStatistics = Map.of("column1", INTEGER_COLUMN_STATISTICS);
        PartitionStatistics expectedStatistics = PartitionStatistics.builder()
                .setBasicStatistics(HIVE_BASIC_STATISTICS)
                .setColumnStatistics(columnStatistics)
                .build();
        doCreateEmptyTable(tableName, ORC, columns);
        // Record table statistics for column1, verifying the table starts out with empty statistics
        metastore.updateTableStatistics(tableName.getSchemaName(), tableName.getTableName(), NO_ACID_TRANSACTION, actualStatistics -> {
            assertThat(actualStatistics).isEqualTo(EMPTY_TABLE_STATISTICS);
            return expectedStatistics;
        });
        Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName()).get();
        // Inject a bogus statistics entry directly into the raw Glue table parameters
        TableInput corruptedInput = GlueInputConverter.convertTable(table);
        corruptedInput.setParameters(ImmutableMap.<String, String>builder()
                .putAll(corruptedInput.getParameters())
                .put("column_stats_bad_data", "bad data")
                .buildOrThrow());
        getGlueClient().updateTable(new UpdateTableRequest()
                .withDatabaseName(tableName.getSchemaName())
                .withTableInput(corruptedInput));
        // The unrecognized parameter must be ignored when statistics are read back
        assertThat(metastore.getTableStatistics(tableName.getSchemaName(), tableName.getTableName())).isEqualTo(expectedStatistics);
    } finally {
        dropTable(tableName);
    }
}
Use of io.trino.plugin.hive.PartitionStatistics in the Trino project (trinodb).
Source: class TestHiveGlueMetastore, method testStatisticsLargeNumberOfColumns.
@Test
public void testStatisticsLargeNumberOfColumns() throws Exception {
    SchemaTableName tableName = temporaryTable("test_statistics_large_number_of_columns");
    try {
        ImmutableList.Builder<ColumnMetadata> columns = ImmutableList.builder();
        ImmutableMap.Builder<String, HiveColumnStatistics> columnStatistics = ImmutableMap.builder();
        // Long padded column names inflate the serialized statistics payload stored in Glue
        String padding = String.join("", Collections.nCopies(240, "x"));
        for (int columnIndex = 1; columnIndex < 1500; columnIndex++) {
            String columnName = "t_bigint " + columnIndex + "_" + padding;
            columns.add(new ColumnMetadata(columnName, BIGINT));
            columnStatistics.put(columnName, createIntegerColumnStatistics(
                    OptionalLong.of(-1000 - columnIndex),
                    OptionalLong.of(1000 + columnIndex),
                    OptionalLong.of(columnIndex),
                    OptionalLong.of(2 * columnIndex)));
        }
        PartitionStatistics expectedStatistics = PartitionStatistics.builder()
                .setBasicStatistics(HIVE_BASIC_STATISTICS)
                .setColumnStatistics(columnStatistics.buildOrThrow())
                .build();
        doCreateEmptyTable(tableName, ORC, columns.build());
        testUpdateTableStatistics(tableName, EMPTY_TABLE_STATISTICS, expectedStatistics);
    } finally {
        dropTable(tableName);
    }
}
Use of io.trino.plugin.hive.PartitionStatistics in the Trino project (trinodb).
Source: class TestMetastoreHiveStatisticsProvider, method testGetTableStatistics.
@Test
public void testGetTableStatistics() {
    String partitionName = "p1=string1/p2=1234";
    // Metastore reports 1000 rows and integer statistics for the regular column
    PartitionStatistics statistics = PartitionStatistics.builder()
            .setBasicStatistics(new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(1000), OptionalLong.empty(), OptionalLong.empty()))
            .setColumnStatistics(ImmutableMap.of(COLUMN, createIntegerColumnStatistics(OptionalLong.of(-100), OptionalLong.of(100), OptionalLong.of(500), OptionalLong.of(300))))
            .build();
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, table, hivePartitions) -> ImmutableMap.of(partitionName, statistics));
    HiveColumnHandle columnHandle = createBaseColumn(COLUMN, 2, HIVE_LONG, BIGINT, REGULAR, Optional.empty());
    // Partition-key statistics are derived from the single partition value
    ColumnStatistics partitionColumn1Expected = ColumnStatistics.builder()
            .setDataSize(Estimate.of(7000))
            .setNullsFraction(Estimate.of(0))
            .setDistinctValuesCount(Estimate.of(1))
            .build();
    ColumnStatistics partitionColumn2Expected = ColumnStatistics.builder()
            .setRange(new DoubleRange(1234, 1234))
            .setNullsFraction(Estimate.of(0))
            .setDistinctValuesCount(Estimate.of(1))
            .build();
    // Regular-column statistics come straight from the metastore-reported values
    ColumnStatistics regularColumnExpected = ColumnStatistics.builder()
            .setRange(new DoubleRange(-100, 100))
            .setNullsFraction(Estimate.of(0.5))
            .setDistinctValuesCount(Estimate.of(300))
            .build();
    TableStatistics expected = TableStatistics.builder()
            .setRowCount(Estimate.of(1000))
            .setColumnStatistics(PARTITION_COLUMN_1, partitionColumn1Expected)
            .setColumnStatistics(PARTITION_COLUMN_2, partitionColumn2Expected)
            .setColumnStatistics(columnHandle, regularColumnExpected)
            .build();
    assertEquals(
            statisticsProvider.getTableStatistics(
                    SESSION,
                    TABLE,
                    ImmutableMap.of("p1", PARTITION_COLUMN_1, "p2", PARTITION_COLUMN_2, COLUMN, columnHandle),
                    ImmutableMap.of("p1", VARCHAR, "p2", BIGINT, COLUMN, BIGINT),
                    ImmutableList.of(partition(partitionName))),
            expected);
}
Use of io.trino.plugin.hive.PartitionStatistics in the Trino project (trinodb).
Source: class TestMetastoreHiveStatisticsProvider, method testGetTableStatisticsValidationFailure.
@Test
public void testGetTableStatisticsValidationFailure() {
    // A negative file count makes the basic statistics invalid
    PartitionStatistics corruptedStatistics = PartitionStatistics.builder()
            .setBasicStatistics(new HiveBasicStatistics(-1, 0, 0, 0))
            .build();
    String partitionName = "p1=string1/p2=1234";
    MetastoreHiveStatisticsProvider statisticsProvider = new MetastoreHiveStatisticsProvider((session, table, hivePartitions) -> ImmutableMap.of(partitionName, corruptedStatistics));
    // With validation enabled, corrupted statistics must be rejected
    assertThatThrownBy(() -> statisticsProvider.getTableStatistics(
            getHiveSession(new HiveConfig().setIgnoreCorruptedStatistics(false)),
            TABLE,
            ImmutableMap.of(),
            ImmutableMap.of(),
            ImmutableList.of(partition(partitionName))))
            .isInstanceOf(TrinoException.class)
            .hasFieldOrPropertyWithValue("errorCode", HIVE_CORRUPTED_COLUMN_STATISTICS.toErrorCode());
    // With validation disabled, corrupted statistics degrade to empty table statistics
    assertEquals(
            statisticsProvider.getTableStatistics(
                    getHiveSession(new HiveConfig().setIgnoreCorruptedStatistics(true)),
                    TABLE,
                    ImmutableMap.of(),
                    ImmutableMap.of(),
                    ImmutableList.of(partition(partitionName))),
            TableStatistics.empty());
}
Aggregations