use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class FileHiveMetastore method getTableStatistics.
/**
 * Builds the statistics of a table from its stored table metadata.
 *
 * <p>The table's schema file is read from the table metadata directory and its
 * writer version is checked before any statistics are extracted. Basic statistics
 * are derived from the metadata parameters; column statistics are taken from the
 * metadata as-is.
 *
 * @param databaseName name of the database containing the table
 * @param tableName name of the table
 * @return the basic and column statistics found in the table metadata
 * @throws TableNotFoundException if no schema file could be read for the table
 */
private synchronized PartitionStatistics getTableStatistics(String databaseName, String tableName) {
    TableMetadata metadata = readSchemaFile(TABLE, getTableMetadataDirectory(databaseName, tableName), tableCodec)
            .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));

    // Validate the writer version before using anything else from the metadata.
    checkVersion(metadata.getWriterVersion());

    return new PartitionStatistics(
            getHiveBasicStatistics(metadata.getParameters()),
            metadata.getColumnStatistics());
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class ThriftMetastoreUtil method getHiveBasicStatistics.
/**
 * Creates {@link HiveBasicStatistics} from a parameter map.
 *
 * <p>The values stored under {@code NUM_FILES}, {@code NUM_ROWS}, {@code RAW_DATA_SIZE}
 * and {@code TOTAL_SIZE} are each passed through {@code parse} and supplied, in that
 * order, as the file count, row count, in-memory data size and on-disk data size.
 *
 * @param parameters parameter map to read the statistics values from
 * @return basic statistics built from the parsed parameter values
 */
public static HiveBasicStatistics getHiveBasicStatistics(Map<String, String> parameters) {
    return new HiveBasicStatistics(
            parse(parameters.get(NUM_FILES)),
            parse(parameters.get(NUM_ROWS)),
            parse(parameters.get(RAW_DATA_SIZE)),
            parse(parameters.get(TOTAL_SIZE)));
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class ThriftHiveMetastore method getTableStatistics.
/**
 * Collects the basic and column statistics of the given table.
 *
 * <p>Basic statistics come from the table parameters. Column statistics are
 * requested for every column listed in the table's storage descriptor, passing
 * along the row count from the basic statistics.
 *
 * @param identity identity forwarded to the column statistics lookup
 * @param table table whose statistics are requested
 * @return the combined basic and column statistics
 */
@Override
public PartitionStatistics getTableStatistics(HiveIdentity identity, Table table) {
    List<String> columnNames = table.getSd().getCols().stream()
            .map(FieldSchema::getName)
            .collect(toImmutableList());
    HiveBasicStatistics tableBasics = getHiveBasicStatistics(table.getParameters());
    return new PartitionStatistics(
            tableBasics,
            getTableColumnStatistics(identity, table.getDbName(), table.getTableName(), columnNames, tableBasics.getRowCount()));
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class MetastoreHiveStatisticsProvider method calculatePartitionsRowCount.
/**
 * Estimates the average rows per partition and the total row count for
 * {@code queriedPartitionsCount} partitions from the given statistics sample.
 *
 * <p>Only statistics entries with a row count present contribute. If none has a
 * row count, the result is empty. When at most two row counts are available, or
 * the sample size equals {@code queriedPartitionsCount}, the plain average is
 * used and multiplied by {@code queriedPartitionsCount}. Otherwise the smallest
 * and largest row counts are left out of the average, the average is applied to
 * {@code queriedPartitionsCount - 2} partitions, and the two excluded values are
 * added back to the total.
 *
 * @param statistics sampled partition statistics
 * @param queriedPartitionsCount partition count the total is extrapolated to
 * @return the estimated average and total row counts, or empty if no row count is known
 */
@VisibleForTesting
static Optional<PartitionsRowCount> calculatePartitionsRowCount(Collection<PartitionStatistics> statistics, int queriedPartitionsCount) {
    long[] rowCounts = statistics.stream()
            .map(PartitionStatistics::getBasicStatistics)
            .map(HiveBasicStatistics::getRowCount)
            .filter(OptionalLong::isPresent)
            .mapToLong(OptionalLong::getAsLong)
            .peek(count -> verify(count >= 0, "count must be greater than or equal to zero"))
            .toArray();
    int sampleSize = statistics.size();

    // Without any known row count there is nothing to estimate from.
    if (rowCounts.length == 0) {
        return Optional.empty();
    }

    boolean trimOutliers = rowCounts.length > 2 && queriedPartitionsCount != sampleSize;
    if (!trimOutliers) {
        // Too few values to trim, or the sample contains all the queried partitions: estimate avg normally
        double plainAverage = Arrays.stream(rowCounts).average().getAsDouble();
        return Optional.of(new PartitionsRowCount(plainAverage, plainAverage * queriedPartitionsCount));
    }

    // Some partitions (e.g. __HIVE_DEFAULT_PARTITION__) may be outliers in terms of row count.
    // Leaving the smallest and largest row counts out of the average reduces the chance that a
    // couple of outliers distort the extrapolated row count.
    long smallest = rowCounts[0];
    long largest = rowCounts[0];
    long total = 0;
    for (long count : rowCounts) {
        smallest = Math.min(smallest, count);
        largest = Math.max(largest, count);
        total += count;
    }

    double trimmedAverage = ((double) (total - smallest - largest)) / (rowCounts.length - 2);
    // The two excluded partitions are counted with their actual values rather than the average.
    double estimatedRowCount = (trimmedAverage * (queriedPartitionsCount - 2)) + smallest + largest;
    return Optional.of(new PartitionsRowCount(trimmedAverage, estimatedRowCount));
}
use of io.trino.plugin.hive.HiveBasicStatistics in project trino by trinodb.
the class TestStatistics method testReduce.
/**
 * Checks {@code reduce} for both the {@code ADD} and {@code SUBTRACT} operators:
 * combinations involving empty statistics are expected to produce empty statistics,
 * and fully populated statistics are expected to be combined value by value.
 */
@Test
public void testReduce() {
// ADD: whenever either operand is the empty statistics, the expected result is the empty statistics.
assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), ADD)).isEqualTo(createEmptyStatistics());
assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), ADD)).isEqualTo(createEmptyStatistics());
assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), ADD)).isEqualTo(createEmptyStatistics());
// SUBTRACT: the same three combinations are likewise expected to yield the empty statistics.
assertThat(reduce(createEmptyStatistics(), createEmptyStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics());
assertThat(reduce(createZeroStatistics(), createEmptyStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics());
assertThat(reduce(createEmptyStatistics(), createZeroStatistics(), SUBTRACT)).isEqualTo(createEmptyStatistics());
// Fully populated operands: each of the four values is added (11+1, 9+2, 7+3, 5+4) ...
assertThat(reduce(new HiveBasicStatistics(11, 9, 7, 5), new HiveBasicStatistics(1, 2, 3, 4), ADD)).isEqualTo(new HiveBasicStatistics(12, 11, 10, 9));
// ... or subtracted (11-1, 9-2, 7-3, 5-4) position by position.
assertThat(reduce(new HiveBasicStatistics(11, 9, 7, 5), new HiveBasicStatistics(1, 2, 3, 4), SUBTRACT)).isEqualTo(new HiveBasicStatistics(10, 7, 4, 1));
}
Aggregations