use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.
the class TestTpchMetadata method testColumnStats.
private void testColumnStats(String schema, TpchTable<?> table, TpchColumn<?> column, Constraint constraint, ColumnStatistics expected) {
TpchTableHandle tableHandle = tpchMetadata.getTableHandle(session, new SchemaTableName(schema, table.getTableName()));
TableStatistics tableStatistics = tpchMetadata.getTableStatistics(session, tableHandle, constraint);
ColumnHandle columnHandle = tpchMetadata.getColumnHandles(session, tableHandle).get(column.getSimplifiedColumnName());
ColumnStatistics actual = tableStatistics.getColumnStatistics().get(columnHandle);
EstimateAssertion estimateAssertion = new EstimateAssertion(TOLERANCE);
estimateAssertion.assertClose(actual.getDistinctValuesCount(), expected.getDistinctValuesCount(), "distinctValuesCount");
estimateAssertion.assertClose(actual.getDataSize(), expected.getDataSize(), "dataSize");
estimateAssertion.assertClose(actual.getNullsFraction(), expected.getNullsFraction(), "nullsFraction");
estimateAssertion.assertClose(actual.getRange(), expected.getRange(), "range");
}
use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.
the class AbstractTestHive method assertTableStatsComputed.
private void assertTableStatsComputed(SchemaTableName tableName, Set<String> expectedColumnStatsColumns) {
try (Transaction transaction = newTransaction()) {
ConnectorMetadata metadata = transaction.getMetadata();
ConnectorSession session = newSession();
ConnectorTableHandle tableHandle = getTableHandle(metadata, tableName);
TableStatistics tableStatistics = metadata.getTableStatistics(session, tableHandle, Constraint.alwaysTrue());
assertFalse(tableStatistics.getRowCount().isUnknown(), "row count is unknown");
Map<String, ColumnStatistics> columnsStatistics = tableStatistics.getColumnStatistics().entrySet().stream().collect(toImmutableMap(entry -> ((HiveColumnHandle) entry.getKey()).getName(), Map.Entry::getValue));
assertEquals(columnsStatistics.keySet(), expectedColumnStatsColumns, "columns with statistics");
Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle);
columnsStatistics.forEach((columnName, columnStatistics) -> {
ColumnHandle columnHandle = columnHandles.get(columnName);
Type columnType = metadata.getColumnMetadata(session, tableHandle, columnHandle).getType();
assertFalse(columnStatistics.getNullsFraction().isUnknown(), "unknown nulls fraction for " + columnName);
assertFalse(columnStatistics.getDistinctValuesCount().isUnknown(), "unknown distinct values count for " + columnName);
if (columnType instanceof VarcharType) {
assertFalse(columnStatistics.getDataSize().isUnknown(), "unknown data size for " + columnName);
} else {
assertTrue(columnStatistics.getDataSize().isUnknown(), "unknown data size for" + columnName);
}
});
}
}
use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method calculateNullsFraction.
@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics) {
List<PartitionStatistics> statisticsWithKnownRowCountAndNullsCount = partitionStatistics.stream().filter(statistics -> {
if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
return false;
}
HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
if (columnStatistics == null) {
return false;
}
return columnStatistics.getNullsCount().isPresent();
}).collect(toImmutableList());
if (statisticsWithKnownRowCountAndNullsCount.isEmpty()) {
return Estimate.unknown();
}
long totalNullsCount = 0;
long totalRowCount = 0;
for (PartitionStatistics statistics : statisticsWithKnownRowCountAndNullsCount) {
long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
verifyNotNull(columnStatistics, "columnStatistics is null");
long nullsCount = columnStatistics.getNullsCount().orElseThrow(() -> new VerifyException("nullsCount is not present"));
verify(nullsCount >= 0, "nullsCount must be greater than or equal to zero");
verify(nullsCount <= rowCount, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", nullsCount, rowCount);
totalNullsCount += nullsCount;
totalRowCount += rowCount;
}
if (totalRowCount == 0) {
return Estimate.zero();
}
verify(totalNullsCount <= totalRowCount, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", totalNullsCount, totalRowCount);
return Estimate.of(((double) totalNullsCount) / totalRowCount);
}
use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method getTableStatistics.
private static TableStatistics getTableStatistics(Map<String, ColumnHandle> columns, Map<String, Type> columnTypes, List<HivePartition> partitions, Map<String, PartitionStatistics> statistics) {
if (statistics.isEmpty()) {
return TableStatistics.empty();
}
checkArgument(!partitions.isEmpty(), "partitions is empty");
Optional<PartitionsRowCount> optionalRowCount = calculatePartitionsRowCount(statistics.values(), partitions.size());
if (optionalRowCount.isEmpty()) {
return TableStatistics.empty();
}
double rowCount = optionalRowCount.get().getRowCount();
TableStatistics.Builder result = TableStatistics.builder();
result.setRowCount(Estimate.of(rowCount));
for (Map.Entry<String, ColumnHandle> column : columns.entrySet()) {
String columnName = column.getKey();
HiveColumnHandle columnHandle = (HiveColumnHandle) column.getValue();
Type columnType = columnTypes.get(columnName);
ColumnStatistics columnStatistics;
if (columnHandle.isPartitionKey()) {
double averageRowsPerPartition = optionalRowCount.get().getAverageRowsPerPartition();
columnStatistics = createPartitionColumnStatistics(columnHandle, columnType, partitions, statistics, averageRowsPerPartition, rowCount);
} else {
columnStatistics = createDataColumnStatistics(columnName, columnType, rowCount, statistics.values());
}
result.setColumnStatistics(columnHandle, columnStatistics);
}
return result.build();
}
use of io.trino.spi.statistics.ColumnStatistics in project trino by trinodb.
the class TestDeltaLakeMetastoreStatistics method testStatisticsParquetParsedStatisticsNullCount.
@Test
public void testStatisticsParquetParsedStatisticsNullCount() {
// The transaction log for this table was created so that the checkpoints only write struct statistics, not json statistics
// The table has one INTEGER column 'i' where 3 of the 9 values are null
DeltaLakeTableHandle tableHandle = registerTable("parquet_struct_statistics_null_count");
TableStatistics stats = deltaLakeMetastore.getTableStatistics(SESSION, tableHandle, Constraint.alwaysTrue());
assertEquals(stats.getRowCount(), Estimate.of(9));
Map<ColumnHandle, ColumnStatistics> statisticsMap = stats.getColumnStatistics();
ColumnStatistics columnStats = statisticsMap.get(new DeltaLakeColumnHandle("i", INTEGER, REGULAR));
assertEquals(columnStats.getNullsFraction(), Estimate.of(3.0 / 9.0));
}
Aggregations