Use of io.trino.plugin.hive.metastore.HiveColumnStatistics in project trino by trinodb.
The class FileHiveMetastore, method getTableStatistics.
private synchronized PartitionStatistics getTableStatistics(String databaseName, String tableName) {
    Path tableMetadataDirectory = getTableMetadataDirectory(databaseName, tableName);
    TableMetadata tableMetadata = readSchemaFile(TABLE, tableMetadataDirectory, tableCodec)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));
    checkVersion(tableMetadata.getWriterVersion());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(tableMetadata.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = tableMetadata.getColumnStatistics();
    return new PartitionStatistics(basicStatistics, columnStatistics);
}
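The method packages the table-level counters together with the per-column map into a single PartitionStatistics. A minimal consumer sketch, assuming only the accessors already used in these snippets (getBasicStatistics, getColumnStatistics, getRowCount, getNullsCount, getTotalSizeInBytes); the schema, table, and column names are placeholders:

// Hypothetical caller; "web", "orders", and "total_price" are illustrative names.
PartitionStatistics stats = getTableStatistics("web", "orders");
OptionalLong rowCount = stats.getBasicStatistics().getRowCount();
HiveColumnStatistics priceStats = stats.getColumnStatistics().get("total_price");
if (priceStats != null) {
    OptionalLong nullsCount = priceStats.getNullsCount();
    OptionalLong dataSize = priceStats.getTotalSizeInBytes();
    // feed rowCount, nullsCount, and dataSize into cost-based planning or reporting
}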
Use of io.trino.plugin.hive.metastore.HiveColumnStatistics in project trino by trinodb.
The class GlueStatConverter, method toColumnStatistics.
private static ColumnStatistics toColumnStatistics(Column column, HiveColumnStatistics statistics, OptionalLong rowCount) {
    ColumnStatistics columnStatistics = new ColumnStatistics();
    HiveType columnType = column.getType();
    columnStatistics.setColumnName(column.getName());
    columnStatistics.setColumnType(columnType.toString());
    ColumnStatisticsData catalogColumnStatisticsData = toGlueColumnStatisticsData(statistics, columnType, rowCount);
    columnStatistics.setStatisticsData(catalogColumnStatisticsData);
    columnStatistics.setAnalyzedTime(new Date());
    return columnStatistics;
}
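A hedged sketch of how this per-column conversion might be driven for a whole table; the loop, the tableColumns list, and the columnStatisticsByName map are illustrative assumptions, and only toColumnStatistics itself comes from the snippet above:

// Hypothetical driver: convert every column's Hive statistics into Glue ColumnStatistics.
List<ColumnStatistics> glueColumnStatistics = new ArrayList<>();
for (Column column : tableColumns) {
    HiveColumnStatistics hiveStatistics = columnStatisticsByName.get(column.getName());
    if (hiveStatistics != null) {
        glueColumnStatistics.add(toColumnStatistics(column, hiveStatistics, rowCount));
    }
}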
Use of io.trino.plugin.hive.metastore.HiveColumnStatistics in project trino by trinodb.
The class GlueStatConverter, method toGlueColumnStatisticsData.
private static ColumnStatisticsData toGlueColumnStatisticsData(HiveColumnStatistics statistics, HiveType columnType, OptionalLong rowCount) {
    TypeInfo typeInfo = columnType.getTypeInfo();
    checkArgument(typeInfo.getCategory() == PRIMITIVE, "Unsupported statistics type: %s", columnType);
    ColumnStatisticsData catalogColumnStatisticsData = new ColumnStatisticsData();
    switch (((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory()) {
        case BOOLEAN:
        {
            BooleanColumnStatisticsData data = new BooleanColumnStatisticsData();
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            statistics.getBooleanStatistics().ifPresent(booleanStatistics -> {
                booleanStatistics.getFalseCount().ifPresent(data::setNumberOfFalses);
                booleanStatistics.getTrueCount().ifPresent(data::setNumberOfTrues);
            });
            catalogColumnStatisticsData.setType(ColumnStatisticsType.BOOLEAN.toString());
            catalogColumnStatisticsData.setBooleanColumnStatisticsData(data);
            break;
        }
        case BINARY:
        {
            BinaryColumnStatisticsData data = new BinaryColumnStatisticsData();
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            data.setMaximumLength(statistics.getMaxValueSizeInBytes().orElse(0));
            data.setAverageLength(getAverageColumnLength(statistics.getTotalSizeInBytes(), rowCount, statistics.getNullsCount()).orElse(0));
            catalogColumnStatisticsData.setType(ColumnStatisticsType.BINARY.toString());
            catalogColumnStatisticsData.setBinaryColumnStatisticsData(data);
            break;
        }
        case DATE:
        {
            DateColumnStatisticsData data = new DateColumnStatisticsData();
            statistics.getDateStatistics().ifPresent(dateStatistics -> {
                dateStatistics.getMin().ifPresent(value -> data.setMinimumValue(localDateToDate(value)));
                dateStatistics.getMax().ifPresent(value -> data.setMaximumValue(localDateToDate(value)));
            });
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumberOfDistinctValues);
            catalogColumnStatisticsData.setType(ColumnStatisticsType.DATE.toString());
            catalogColumnStatisticsData.setDateColumnStatisticsData(data);
            break;
        }
        case DECIMAL:
        {
            DecimalColumnStatisticsData data = new DecimalColumnStatisticsData();
            statistics.getDecimalStatistics().ifPresent(decimalStatistics -> {
                decimalStatistics.getMin().ifPresent(value -> data.setMinimumValue(bigDecimalToGlueDecimal(value)));
                decimalStatistics.getMax().ifPresent(value -> data.setMaximumValue(bigDecimalToGlueDecimal(value)));
            });
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumberOfDistinctValues);
            catalogColumnStatisticsData.setType(ColumnStatisticsType.DECIMAL.toString());
            catalogColumnStatisticsData.setDecimalColumnStatisticsData(data);
            break;
        }
        case FLOAT:
        case DOUBLE:
        {
            DoubleColumnStatisticsData data = new DoubleColumnStatisticsData();
            statistics.getDoubleStatistics().ifPresent(doubleStatistics -> {
                doubleStatistics.getMin().ifPresent(data::setMinimumValue);
                doubleStatistics.getMax().ifPresent(data::setMaximumValue);
            });
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumberOfDistinctValues);
            catalogColumnStatisticsData.setType(ColumnStatisticsType.DOUBLE.toString());
            catalogColumnStatisticsData.setDoubleColumnStatisticsData(data);
            break;
        }
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
        case TIMESTAMP:
        {
            LongColumnStatisticsData data = new LongColumnStatisticsData();
            statistics.getIntegerStatistics().ifPresent(stats -> {
                stats.getMin().ifPresent(data::setMinimumValue);
                stats.getMax().ifPresent(data::setMaximumValue);
            });
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumberOfDistinctValues);
            catalogColumnStatisticsData.setType(ColumnStatisticsType.LONG.toString());
            catalogColumnStatisticsData.setLongColumnStatisticsData(data);
            break;
        }
        case VARCHAR:
        case CHAR:
        case STRING:
        {
            StringColumnStatisticsData data = new StringColumnStatisticsData();
            statistics.getNullsCount().ifPresent(data::setNumberOfNulls);
            toMetastoreDistinctValuesCount(statistics.getDistinctValuesCount(), statistics.getNullsCount()).ifPresent(data::setNumberOfDistinctValues);
            data.setMaximumLength(statistics.getMaxValueSizeInBytes().orElse(0));
            data.setAverageLength(getAverageColumnLength(statistics.getTotalSizeInBytes(), rowCount, statistics.getNullsCount()).orElse(0));
            catalogColumnStatisticsData.setType(ColumnStatisticsType.STRING.toString());
            catalogColumnStatisticsData.setStringColumnStatisticsData(data);
            break;
        }
        default:
            throw new TrinoException(HIVE_INVALID_METADATA, "Invalid column statistics type: " + ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory());
    }
    return catalogColumnStatisticsData;
}
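The switch relies on two helpers that are not shown here, getAverageColumnLength and toMetastoreDistinctValuesCount. A minimal sketch of the average-length arithmetic, assuming OptionalLong inputs matching the accessors above; the real helper in GlueStatConverter may differ in signature and edge-case handling:

// Illustrative only: average length = total size / number of non-null rows, empty when undefined.
private static OptionalDouble getAverageColumnLength(OptionalLong totalSizeInBytes, OptionalLong rowCount, OptionalLong nullsCount)
{
    if (totalSizeInBytes.isEmpty() || rowCount.isEmpty() || nullsCount.isEmpty()) {
        return OptionalDouble.empty();
    }
    long nonNullCount = rowCount.getAsLong() - nullsCount.getAsLong();
    if (nonNullCount <= 0) {
        return OptionalDouble.empty();
    }
    return OptionalDouble.of(((double) totalSizeInBytes.getAsLong()) / nonNullCount);
}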
Use of io.trino.plugin.hive.metastore.HiveColumnStatistics in project trino by trinodb.
The class ThriftHiveMetastore, method getTableStatistics.
@Override
public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) {
    List<String> dataColumns = table.getSd().getCols().stream()
            .map(FieldSchema::getName)
            .collect(toImmutableList());
    HiveBasicStatistics basicStatistics = getHiveBasicStatistics(table.getParameters());
    Map<String, HiveColumnStatistics> columnStatistics = getTableColumnStatistics(identity, table.getDbName(), table.getTableName(), dataColumns, basicStatistics.getRowCount());
    return new PartitionStatistics(basicStatistics, columnStatistics);
}
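As in FileHiveMetastore, the table-level counters come from getHiveBasicStatistics(table.getParameters()), which reads the standard Hive statistics keys out of the table properties; the per-column statistics are then fetched separately over Thrift for the listed data columns. A hedged sketch of that parameter extraction, assuming the standard Hive keys (numFiles, numRows, rawDataSize, totalSize), a HiveBasicStatistics constructor taking the four counters in that order, and a hypothetical parseLongParameter helper; the real shared helper in Trino may differ:

// Illustrative sketch: read the standard Hive stats parameters into HiveBasicStatistics.
static HiveBasicStatistics getHiveBasicStatistics(Map<String, String> parameters)
{
    return new HiveBasicStatistics(
            parseLongParameter(parameters, "numFiles"),
            parseLongParameter(parameters, "numRows"),
            parseLongParameter(parameters, "rawDataSize"),
            parseLongParameter(parameters, "totalSize"));
}

private static OptionalLong parseLongParameter(Map<String, String> parameters, String key)
{
    String value = parameters.get(key);
    if (value == null) {
        return OptionalLong.empty();
    }
    try {
        return OptionalLong.of(Long.parseLong(value));
    }
    catch (NumberFormatException e) {
        return OptionalLong.empty();
    }
}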
Use of io.trino.plugin.hive.metastore.HiveColumnStatistics in project trino by trinodb.
The class MetastoreHiveStatisticsProvider, method calculateDataSize.
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount) {
    List<PartitionStatistics> statisticsWithKnownRowCountAndDataSize = partitionStatistics.stream()
            .filter(statistics -> {
                if (statistics.getBasicStatistics().getRowCount().isEmpty()) {
                    return false;
                }
                HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
                if (columnStatistics == null) {
                    return false;
                }
                return columnStatistics.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (statisticsWithKnownRowCountAndDataSize.isEmpty()) {
        return Estimate.unknown();
    }
    long knownRowCount = 0;
    long knownDataSize = 0;
    for (PartitionStatistics statistics : statisticsWithKnownRowCountAndDataSize) {
        long rowCount = statistics.getBasicStatistics().getRowCount().orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(rowCount >= 0, "rowCount must be greater than or equal to zero");
        HiveColumnStatistics columnStatistics = statistics.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long dataSize = columnStatistics.getTotalSizeInBytes().orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(dataSize >= 0, "dataSize must be greater than or equal to zero");
        knownRowCount += rowCount;
        knownDataSize += dataSize;
    }
    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (knownRowCount == 0) {
        return Estimate.unknown();
    }
    double averageValueDataSizeInBytes = ((double) knownDataSize) / knownRowCount;
    return Estimate.of(averageValueDataSizeInBytes * totalRowCount);
}
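A worked example with illustrative numbers: suppose two partitions pass the filter, one with 100 rows and 1,000 bytes recorded for the column and one with 300 rows and 5,000 bytes. Then knownRowCount = 400, knownDataSize = 6,000, and the average value size is 6,000 / 400 = 15 bytes; with totalRowCount = 1,000 (which may include partitions whose data size for this column is unknown), the method returns Estimate.of(15 * 1,000), i.e. an estimated 15,000 bytes for the column across the whole table.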