use of io.trino.spi.statistics.Estimate in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method calculateDataSize.
/**
 * Estimates the total data size, in bytes, of {@code column} over {@code totalRowCount} rows.
 *
 * <p>The average size per row is computed from the partitions that report a row count and a
 * total size in bytes for the column, and is then scaled by {@code totalRowCount}.
 *
 * @param column name of the column whose statistics are looked up in each partition
 * @param partitionStatistics statistics of the sampled partitions
 * @param totalRowCount row count the per-row average is extrapolated to
 * @return {@link Estimate#unknown()} when no partition provides both values or when the usable
 *         partitions contain zero rows in total, {@link Estimate#zero()} when
 *         {@code totalRowCount} is zero, otherwise the extrapolated size
 */
@VisibleForTesting
static Estimate calculateDataSize(String column, Collection<PartitionStatistics> partitionStatistics, double totalRowCount)
{
    // Only partitions that report both a row count and a size for this column can contribute.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> partition.getBasicStatistics().getRowCount().isPresent())
            .filter(partition -> {
                HiveColumnStatistics candidate = partition.getColumnStatistics().get(column);
                return candidate != null && candidate.getTotalSizeInBytes().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long rowsSeen = 0;
    long bytesSeen = 0;
    for (PartitionStatistics partition : usablePartitions) {
        // The values were checked by the filter above; the verifications guard against inconsistent statistics.
        long partitionRows = partition.getBasicStatistics().getRowCount()
                .orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics columnStatistics = partition.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long partitionBytes = columnStatistics.getTotalSizeInBytes()
                .orElseThrow(() -> new VerifyException("totalSizeInBytes is not present"));
        verify(partitionBytes >= 0, "dataSize must be greater than or equal to zero");

        rowsSeen += partitionRows;
        bytesSeen += partitionBytes;
    }

    if (totalRowCount == 0) {
        return Estimate.zero();
    }
    if (rowsSeen == 0) {
        // No rows to average over, so a per-row size cannot be derived.
        return Estimate.unknown();
    }

    double bytesPerRow = ((double) bytesSeen) / rowsSeen;
    return Estimate.of(bytesPerRow * totalRowCount);
}
use of io.trino.spi.statistics.Estimate in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method calculatePartitionsRowCount.
/**
 * Extrapolates the row count of all queried partitions from a sample of partition statistics.
 *
 * <p>When at most two sampled partitions report a row count, or the sample size equals
 * {@code queriedPartitionsCount}, the plain average of the known row counts is used. Otherwise
 * the smallest and largest row counts are left out of the average and added back once each to
 * the extrapolated total.
 *
 * @param statistics statistics of the sampled partitions
 * @param queriedPartitionsCount number of partitions the estimate is extrapolated to
 * @return the average rows per partition together with the extrapolated total row count, or
 *         {@link Optional#empty()} when no sampled partition reports a row count
 */
@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount)
{
    long[] knownCounts = statistics.stream()
            .map(PartitionStatistics::getBasicStatistics)
            .map(HiveBasicStatistics::getRowCount)
            .filter(OptionalLong::isPresent)
            .mapToLong(present -> {
                long count = present.getAsLong();
                verify(count >= 0, "count must be greater than or equal to zero");
                return count;
            })
            .toArray();
    int sampledPartitions = statistics.size();

    // Either there are too few values to trim outliers, or the sample already contains
    // all the queried partitions: estimate with the plain average.
    if (knownCounts.length <= 2 || queriedPartitionsCount == sampledPartitions) {
        OptionalDouble plainAverage = Arrays.stream(knownCounts).average();
        if (plainAverage.isPresent()) {
            double rowsPerPartition = plainAverage.getAsDouble();
            return Optional.of(new PartitionsRowCount(rowsPerPartition, rowsPerPartition * queriedPartitionsCount));
        }
        return Optional.empty();
    }

    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Leaving the smallest and largest row counts out of the average reduces the possibility of
    // errors in the extrapolated row count caused by a couple of outliers.
    long smallest = knownCounts[0];
    long largest = knownCounts[0];
    long total = 0;
    for (long count : knownCounts) {
        smallest = Math.min(smallest, count);
        largest = Math.max(largest, count);
        total += count;
    }

    double trimmedAverage = ((double) (total - smallest - largest)) / (knownCounts.length - 2);
    // The two excluded partitions are accounted for with their actual row counts.
    double extrapolatedRowCount = (trimmedAverage * (queriedPartitionsCount - 2)) + smallest + largest;
    return Optional.of(new PartitionsRowCount(trimmedAverage, extrapolatedRowCount));
}
use of io.trino.spi.statistics.Estimate in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method calculateNullsFraction.
/**
 * Estimates the fraction of null values in {@code column}.
 *
 * <p>Only partitions that report a row count and a nulls count for the column are considered;
 * the result is the sum of their nulls counts divided by the sum of their row counts.
 *
 * @param column name of the column whose statistics are looked up in each partition
 * @param partitionStatistics statistics of the sampled partitions
 * @return {@link Estimate#unknown()} when no partition provides both values,
 *         {@link Estimate#zero()} when the usable partitions contain zero rows in total,
 *         otherwise the nulls fraction
 */
@VisibleForTesting
static Estimate calculateNullsFraction(String column, Collection<PartitionStatistics> partitionStatistics)
{
    // Only partitions that report both a row count and a nulls count for this column can contribute.
    List<PartitionStatistics> usablePartitions = partitionStatistics.stream()
            .filter(partition -> partition.getBasicStatistics().getRowCount().isPresent())
            .filter(partition -> {
                HiveColumnStatistics candidate = partition.getColumnStatistics().get(column);
                return candidate != null && candidate.getNullsCount().isPresent();
            })
            .collect(toImmutableList());
    if (usablePartitions.isEmpty()) {
        return Estimate.unknown();
    }

    long nullsSum = 0;
    long rowsSum = 0;
    for (PartitionStatistics partition : usablePartitions) {
        // The values were checked by the filter above; the verifications guard against inconsistent statistics.
        long partitionRows = partition.getBasicStatistics().getRowCount()
                .orElseThrow(() -> new VerifyException("rowCount is not present"));
        verify(partitionRows >= 0, "rowCount must be greater than or equal to zero");

        HiveColumnStatistics columnStatistics = partition.getColumnStatistics().get(column);
        verifyNotNull(columnStatistics, "columnStatistics is null");
        long partitionNulls = columnStatistics.getNullsCount()
                .orElseThrow(() -> new VerifyException("nullsCount is not present"));
        verify(partitionNulls >= 0, "nullsCount must be greater than or equal to zero");
        verify(partitionNulls <= partitionRows, "nullsCount must be less than or equal to rowCount. nullsCount: %s. rowCount: %s.", partitionNulls, partitionRows);

        nullsSum += partitionNulls;
        rowsSum += partitionRows;
    }

    if (rowsSum == 0) {
        return Estimate.zero();
    }

    verify(nullsSum <= rowsSum, "totalNullsCount must be less than or equal to totalRowCount. totalNullsCount: %s. totalRowCount: %s.", nullsSum, rowsSum);
    return Estimate.of(((double) nullsSum) / rowsSum);
}
Aggregations