use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class HiveMetadata method finishInsert.
@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics) {
HiveInsertTableHandle handle = (HiveInsertTableHandle) insertHandle;
List<PartitionUpdate> partitionUpdates = fragments.stream().map(Slice::getBytes).map(partitionUpdateCodec::fromJson).collect(toImmutableList());
HiveStorageFormat tableStorageFormat = handle.getTableStorageFormat();
partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates);
Table table = metastore.getTable(handle.getSchemaName(), handle.getTableName()).orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
if (!table.getStorage().getStorageFormat().getInputFormat().equals(tableStorageFormat.getInputFormat()) && isRespectTableFormat(session)) {
throw new TrinoException(HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
}
if (handle.getBucketProperty().isPresent() && isCreateEmptyBucketFiles(session)) {
List<PartitionUpdate> partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, false, partitionUpdates);
// replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rollback
partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets));
for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) {
Optional<Partition> partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate));
if (handle.isTransactional() && partition.isPresent()) {
PartitionStatistics statistics = PartitionStatistics.builder().setBasicStatistics(partitionUpdate.getStatistics()).build();
metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition.get(), partitionUpdate.getWritePath(), Optional.of(partitionUpdate.getFileNames()), statistics, handle.isRetriesEnabled());
}
createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames());
}
}
List<String> partitionedBy = table.getPartitionColumns().stream().map(Column::getName).collect(toImmutableList());
Map<String, Type> columnTypes = handle.getInputColumns().stream().collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
Map<List<String>, ComputedStatistics> partitionComputedStatistics = createComputedStatisticsToPartitionMap(computedStatistics, partitionedBy, columnTypes);
for (PartitionUpdate partitionUpdate : partitionUpdates) {
if (partitionUpdate.getName().isEmpty()) {
// insert into unpartitioned table
if (!table.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && isRespectTableFormat(session)) {
throw new TrinoException(HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
}
PartitionStatistics partitionStatistics = createPartitionStatistics(partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
if (partitionUpdate.getUpdateMode() == OVERWRITE) {
// get privileges from existing table
PrincipalPrivileges principalPrivileges = fromHivePrivilegeInfos(metastore.listTablePrivileges(handle.getSchemaName(), handle.getTableName(), Optional.empty()));
// first drop it
metastore.dropTable(session, handle.getSchemaName(), handle.getTableName());
// create the table with the new location
metastore.createTable(session, table, principalPrivileges, Optional.of(partitionUpdate.getWritePath()), Optional.of(partitionUpdate.getFileNames()), false, partitionStatistics, handle.isRetriesEnabled());
} else if (partitionUpdate.getUpdateMode() == NEW || partitionUpdate.getUpdateMode() == APPEND) {
// insert into unpartitioned table
metastore.finishInsertIntoExistingTable(session, handle.getSchemaName(), handle.getTableName(), partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, handle.isRetriesEnabled());
} else {
throw new IllegalArgumentException("Unsupported update mode: " + partitionUpdate.getUpdateMode());
}
} else if (partitionUpdate.getUpdateMode() == APPEND) {
// insert into existing partition
List<String> partitionValues = toPartitionValues(partitionUpdate.getName());
PartitionStatistics partitionStatistics = createPartitionStatistics(partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partitionValues));
metastore.finishInsertIntoExistingPartition(session, handle.getSchemaName(), handle.getTableName(), partitionValues, partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, handle.isRetriesEnabled());
} else if (partitionUpdate.getUpdateMode() == NEW || partitionUpdate.getUpdateMode() == OVERWRITE) {
// insert into new partition or overwrite existing partition
Partition partition = buildPartitionObject(session, table, partitionUpdate);
if (!partition.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && isRespectTableFormat(session)) {
throw new TrinoException(HIVE_CONCURRENT_MODIFICATION_DETECTED, "Partition format changed during insert");
}
PartitionStatistics partitionStatistics = createPartitionStatistics(partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partition.getValues()));
if (partitionUpdate.getUpdateMode() == OVERWRITE) {
if (handle.getLocationHandle().getWriteMode() == DIRECT_TO_TARGET_EXISTING_DIRECTORY) {
removeNonCurrentQueryFiles(session, partitionUpdate.getTargetPath());
if (handle.isRetriesEnabled()) {
HdfsContext hdfsContext = new HdfsContext(session);
cleanExtraOutputFiles(hdfsEnvironment, hdfsContext, session.getQueryId(), partitionUpdate.getTargetPath(), ImmutableSet.copyOf(partitionUpdate.getFileNames()));
}
} else {
metastore.dropPartition(session, handle.getSchemaName(), handle.getTableName(), partition.getValues(), true);
metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition, partitionUpdate.getWritePath(), Optional.of(partitionUpdate.getFileNames()), partitionStatistics, handle.isRetriesEnabled());
}
} else {
metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition, partitionUpdate.getWritePath(), Optional.of(partitionUpdate.getFileNames()), partitionStatistics, handle.isRetriesEnabled());
}
} else {
throw new IllegalArgumentException(format("Unsupported update mode: %s", partitionUpdate.getUpdateMode()));
}
}
if (isFullAcidTable(table.getParameters())) {
HdfsContext context = new HdfsContext(session);
for (PartitionUpdate update : partitionUpdates) {
long writeId = handle.getTransaction().getWriteId();
Path deltaDirectory = new Path(format("%s/%s/%s", table.getStorage().getLocation(), update.getName(), deltaSubdir(writeId, writeId, 0)));
createOrcAcidVersionFile(context, deltaDirectory);
}
}
return Optional.of(new HiveWrittenPartitions(partitionUpdates.stream().map(PartitionUpdate::getName).collect(toImmutableList())));
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class StatisticsWriterOperator method getOutput.
@Override
public Page getOutput() {
if (state != State.FINISHING) {
return null;
}
state = State.FINISHED;
Collection<ComputedStatistics> computedStatistics = computedStatisticsBuilder.build();
statisticsWriter.writeStatistics(computedStatistics);
// output page will only be constructed once,
// so a new PageBuilder is constructed (instead of using PageBuilder.reset)
PageBuilder page = new PageBuilder(1, TYPES);
page.declarePosition();
BlockBuilder rowsBuilder = page.getBlockBuilder(0);
if (rowCountEnabled) {
BIGINT.writeLong(rowsBuilder, getRowCount(computedStatistics));
} else {
rowsBuilder.appendNull();
}
return page.build();
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class TestHiveGlueMetastore method testUpdatePartitionedStatisticsOnCreate.
@Test
public void testUpdatePartitionedStatisticsOnCreate() {
SchemaTableName tableName = temporaryTable("update_partitioned_statistics_create");
try (Transaction transaction = newTransaction()) {
ConnectorSession session = newSession();
ConnectorMetadata metadata = transaction.getMetadata();
List<ColumnMetadata> columns = ImmutableList.of(new ColumnMetadata("a_column", BigintType.BIGINT), new ColumnMetadata("part_column", BigintType.BIGINT));
ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(TEXTFILE, ImmutableList.of("part_column")));
ConnectorOutputTableHandle createTableHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);
// write data
ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, createTableHandle);
MaterializedResult data = MaterializedResult.resultBuilder(session, BigintType.BIGINT, BigintType.BIGINT).row(1L, 1L).row(2L, 1L).row(3L, 1L).row(4L, 2L).row(5L, 2L).build();
sink.appendPage(data.toPage());
Collection<Slice> fragments = getFutureValue(sink.finish());
// prepare statistics
ComputedStatistics statistics1 = ComputedStatistics.builder(ImmutableList.of("part_column"), ImmutableList.of(singleValueBlock(1))).addTableStatistic(TableStatisticType.ROW_COUNT, singleValueBlock(3)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MIN_VALUE), singleValueBlock(1)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MAX_VALUE), singleValueBlock(3)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_DISTINCT_VALUES), singleValueBlock(3)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_NON_NULL_VALUES), singleValueBlock(3)).build();
ComputedStatistics statistics2 = ComputedStatistics.builder(ImmutableList.of("part_column"), ImmutableList.of(singleValueBlock(2))).addTableStatistic(TableStatisticType.ROW_COUNT, singleValueBlock(2)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MIN_VALUE), singleValueBlock(4)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MAX_VALUE), singleValueBlock(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_DISTINCT_VALUES), singleValueBlock(2)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_NON_NULL_VALUES), singleValueBlock(2)).build();
// finish CTAS
metadata.finishCreateTable(session, createTableHandle, fragments, ImmutableList.of(statistics1, statistics2));
transaction.commit();
} finally {
dropTable(tableName);
}
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class TestHiveGlueMetastore method testUpdateStatisticsOnCreate.
@Test
public void testUpdateStatisticsOnCreate() {
SchemaTableName tableName = temporaryTable("update_statistics_create");
try (Transaction transaction = newTransaction()) {
ConnectorSession session = newSession();
ConnectorMetadata metadata = transaction.getMetadata();
List<ColumnMetadata> columns = ImmutableList.of(new ColumnMetadata("a_column", BigintType.BIGINT));
ConnectorTableMetadata tableMetadata = new ConnectorTableMetadata(tableName, columns, createTableProperties(TEXTFILE));
ConnectorOutputTableHandle createTableHandle = metadata.beginCreateTable(session, tableMetadata, Optional.empty(), NO_RETRIES);
// write data
ConnectorPageSink sink = pageSinkProvider.createPageSink(transaction.getTransactionHandle(), session, createTableHandle);
MaterializedResult data = MaterializedResult.resultBuilder(session, BigintType.BIGINT).row(1L).row(2L).row(3L).row(4L).row(5L).build();
sink.appendPage(data.toPage());
Collection<Slice> fragments = getFutureValue(sink.finish());
// prepare statistics
ComputedStatistics statistics = ComputedStatistics.builder(ImmutableList.of(), ImmutableList.of()).addTableStatistic(TableStatisticType.ROW_COUNT, singleValueBlock(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MIN_VALUE), singleValueBlock(1)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MAX_VALUE), singleValueBlock(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_DISTINCT_VALUES), singleValueBlock(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_NON_NULL_VALUES), singleValueBlock(5)).build();
// finish CTAS
metadata.finishCreateTable(session, createTableHandle, fragments, ImmutableList.of(statistics));
transaction.commit();
} finally {
dropTable(tableName);
}
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class DeltaLakeMetadata method finishStatisticsCollection.
@Override
public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle table, Collection<ComputedStatistics> computedStatistics) {
DeltaLakeTableHandle tableHandle = (DeltaLakeTableHandle) table;
AnalyzeHandle analyzeHandle = tableHandle.getAnalyzeHandle().orElseThrow(() -> new IllegalArgumentException("analyzeHandle not set"));
String location = metastore.getTableLocation(tableHandle.getSchemaTableName(), session);
Optional<DeltaLakeStatistics> oldStatistics = statisticsAccess.readDeltaLakeStatistics(session, location);
// more elaborate logic for handling statistics model evaluation may need to be introduced in the future
// for now let's have a simple check rejecting update
oldStatistics.ifPresent(statistics -> checkArgument(statistics.getModelVersion() == DeltaLakeStatistics.CURRENT_MODEL_VERSION, "Existing table statistics are incompatible, run the drop statistics procedure on this table before re-analyzing"));
Map<String, DeltaLakeColumnStatistics> oldColumnStatistics = oldStatistics.map(DeltaLakeStatistics::getColumnStatistics).orElseGet(ImmutableMap::of);
Map<String, DeltaLakeColumnStatistics> newColumnStatistics = toDeltaLakeColumnStatistics(computedStatistics);
Map<String, DeltaLakeColumnStatistics> mergedColumnStatistics = new HashMap<>();
// only keep stats for existing columns
Set<String> newColumns = newColumnStatistics.keySet();
oldColumnStatistics.entrySet().stream().filter(entry -> newColumns.contains(entry.getKey())).forEach(entry -> mergedColumnStatistics.put(entry.getKey(), entry.getValue()));
newColumnStatistics.forEach((columnName, columnStatistics) -> {
mergedColumnStatistics.merge(columnName, columnStatistics, DeltaLakeColumnStatistics::update);
});
Optional<Instant> maxFileModificationTime = getMaxFileModificationTime(computedStatistics);
// We do not want to hinder our future calls to ANALYZE if one of the files we analyzed have modification time far in the future.
// Therefore we cap the value we store in extended_stats.json to current_time as observed on Trino coordinator.
Instant finalAlreadyAnalyzedModifiedTimeMax = Instant.now();
if (maxFileModificationTime.isPresent()) {
finalAlreadyAnalyzedModifiedTimeMax = Comparators.min(maxFileModificationTime.get(), finalAlreadyAnalyzedModifiedTimeMax);
}
// also ensure that we are not traveling back in time
if (oldStatistics.isPresent()) {
finalAlreadyAnalyzedModifiedTimeMax = Comparators.max(oldStatistics.get().getAlreadyAnalyzedModifiedTimeMax(), finalAlreadyAnalyzedModifiedTimeMax);
}
if (analyzeHandle.getColumns().isPresent() && !mergedColumnStatistics.keySet().equals(analyzeHandle.getColumns().get())) {
// sanity validation
throw new IllegalStateException(format("Unexpected columns in in mergedColumnStatistics %s; expected %s", mergedColumnStatistics.keySet(), analyzeHandle.getColumns().get()));
}
DeltaLakeStatistics mergedDeltaLakeStatistics = new DeltaLakeStatistics(finalAlreadyAnalyzedModifiedTimeMax, mergedColumnStatistics, analyzeHandle.getColumns());
statisticsAccess.updateDeltaLakeStatistics(session, location, mergedDeltaLakeStatistics);
}
Aggregations