use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class IcebergMetadata method finishInsert.
@Override
public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics) {
IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
Table icebergTable = transaction.table();
List<CommitTaskData> commitTasks = fragments.stream().map(slice -> commitTaskCodec.fromJson(slice.getBytes())).collect(toImmutableList());
Type[] partitionColumnTypes = icebergTable.spec().fields().stream().map(field -> field.transform().getResultType(icebergTable.schema().findType(field.sourceId()))).toArray(Type[]::new);
AppendFiles appendFiles = transaction.newAppend();
ImmutableSet.Builder<String> writtenFiles = ImmutableSet.builder();
for (CommitTaskData task : commitTasks) {
DataFiles.Builder builder = DataFiles.builder(icebergTable.spec()).withPath(task.getPath()).withFileSizeInBytes(task.getFileSizeInBytes()).withFormat(table.getFileFormat().toIceberg()).withMetrics(task.getMetrics().metrics());
if (!icebergTable.spec().fields().isEmpty()) {
String partitionDataJson = task.getPartitionDataJson().orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
}
appendFiles.appendFile(builder.build());
writtenFiles.add(task.getPath());
}
// try to leave as little garbage as possible behind
if (table.getRetryMode() != NO_RETRIES) {
cleanExtraOutputFiles(session, writtenFiles.build());
}
appendFiles.commit();
transaction.commitTransaction();
transaction = null;
return Optional.of(new HiveWrittenPartitions(commitTasks.stream().map(CommitTaskData::getPath).collect(toImmutableList())));
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class IcebergMetadata method finishRefreshMaterializedView.
@Override
public Optional<ConnectorOutputMetadata> finishRefreshMaterializedView(ConnectorSession session, ConnectorTableHandle tableHandle, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics, List<ConnectorTableHandle> sourceTableHandles) {
// delete before insert .. simulating overwrite
executeDelete(session, tableHandle);
IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
Table icebergTable = transaction.table();
List<CommitTaskData> commitTasks = fragments.stream().map(slice -> commitTaskCodec.fromJson(slice.getBytes())).collect(toImmutableList());
Type[] partitionColumnTypes = icebergTable.spec().fields().stream().map(field -> field.transform().getResultType(icebergTable.schema().findType(field.sourceId()))).toArray(Type[]::new);
AppendFiles appendFiles = transaction.newFastAppend();
ImmutableSet.Builder<String> writtenFiles = ImmutableSet.builder();
for (CommitTaskData task : commitTasks) {
DataFiles.Builder builder = DataFiles.builder(icebergTable.spec()).withPath(task.getPath()).withFileSizeInBytes(task.getFileSizeInBytes()).withFormat(table.getFileFormat().toIceberg()).withMetrics(task.getMetrics().metrics());
if (!icebergTable.spec().fields().isEmpty()) {
String partitionDataJson = task.getPartitionDataJson().orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
}
appendFiles.appendFile(builder.build());
writtenFiles.add(task.getPath());
}
String dependencies = sourceTableHandles.stream().map(handle -> (IcebergTableHandle) handle).filter(handle -> handle.getSnapshotId().isPresent()).map(handle -> handle.getSchemaTableName() + "=" + handle.getSnapshotId().get()).distinct().collect(joining(","));
// try to leave as little garbage as possible behind
if (table.getRetryMode() != NO_RETRIES) {
cleanExtraOutputFiles(session, writtenFiles.build());
}
// Update the 'dependsOnTables' property that tracks tables on which the materialized view depends and the corresponding snapshot ids of the tables
appendFiles.set(DEPENDS_ON_TABLES, dependencies);
appendFiles.commit();
transaction.commitTransaction();
transaction = null;
return Optional.of(new HiveWrittenPartitions(commitTasks.stream().map(CommitTaskData::getPath).collect(toImmutableList())));
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class TestStatistics method testFromComputedStatistics.
@Test
public void testFromComputedStatistics() {
Function<Integer, Block> singleIntegerValueBlock = value -> BigintType.BIGINT.createBlockBuilder(null, 1).writeLong(value).build();
ComputedStatistics statistics = ComputedStatistics.builder(ImmutableList.of(), ImmutableList.of()).addTableStatistic(TableStatisticType.ROW_COUNT, singleIntegerValueBlock.apply(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MIN_VALUE), singleIntegerValueBlock.apply(1)).addColumnStatistic(new ColumnStatisticMetadata("a_column", MAX_VALUE), singleIntegerValueBlock.apply(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_DISTINCT_VALUES), singleIntegerValueBlock.apply(5)).addColumnStatistic(new ColumnStatisticMetadata("a_column", NUMBER_OF_NON_NULL_VALUES), singleIntegerValueBlock.apply(5)).addColumnStatistic(new ColumnStatisticMetadata("b_column", NUMBER_OF_NON_NULL_VALUES), singleIntegerValueBlock.apply(4)).build();
Map<String, Type> columnTypes = ImmutableMap.of("a_column", INTEGER, "b_column", VARCHAR);
Map<String, HiveColumnStatistics> columnStatistics = Statistics.fromComputedStatistics(statistics.getColumnStatistics(), columnTypes, 5);
assertThat(columnStatistics).hasSize(2);
assertThat(columnStatistics.keySet()).contains("a_column", "b_column");
assertThat(columnStatistics.get("a_column")).isEqualTo(HiveColumnStatistics.builder().setIntegerStatistics(new IntegerStatistics(OptionalLong.of(1), OptionalLong.of(5))).setNullsCount(0).setDistinctValuesCount(5).build());
assertThat(columnStatistics.get("b_column")).isEqualTo(HiveColumnStatistics.builder().setNullsCount(1).build());
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class HiveMetadata method finishStatisticsCollection.
@Override
public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle, Collection<ComputedStatistics> computedStatistics) {
HiveTableHandle handle = (HiveTableHandle) tableHandle;
SchemaTableName tableName = handle.getSchemaTableName();
Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName()).orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
List<Column> partitionColumns = table.getPartitionColumns();
List<String> partitionColumnNames = partitionColumns.stream().map(Column::getName).collect(toImmutableList());
HiveTimestampPrecision timestampPrecision = getTimestampPrecision(session);
List<HiveColumnHandle> hiveColumnHandles = hiveColumnHandles(table, typeManager, timestampPrecision);
Map<String, Type> columnTypes = hiveColumnHandles.stream().filter(columnHandle -> !columnHandle.isHidden()).collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager, timestampPrecision)));
Map<List<String>, ComputedStatistics> computedStatisticsMap = createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes);
if (partitionColumns.isEmpty()) {
// commit analyze to unpartitioned table
metastore.setTableStatistics(table, createPartitionStatistics(columnTypes, computedStatisticsMap.get(ImmutableList.<String>of())));
} else {
List<List<String>> partitionValuesList;
if (handle.getAnalyzePartitionValues().isPresent()) {
partitionValuesList = handle.getAnalyzePartitionValues().get();
} else {
partitionValuesList = metastore.getPartitionNames(handle.getSchemaName(), handle.getTableName()).orElseThrow(() -> new TableNotFoundException(((HiveTableHandle) tableHandle).getSchemaTableName())).stream().map(HiveUtil::toPartitionValues).collect(toImmutableList());
}
ImmutableMap.Builder<List<String>, PartitionStatistics> partitionStatistics = ImmutableMap.builder();
Map<String, Set<ColumnStatisticType>> columnStatisticTypes = hiveColumnHandles.stream().filter(columnHandle -> !partitionColumnNames.contains(columnHandle.getName())).filter(column -> !column.isHidden()).collect(toImmutableMap(HiveColumnHandle::getName, column -> ImmutableSet.copyOf(metastore.getSupportedColumnStatistics(column.getType()))));
Supplier<PartitionStatistics> emptyPartitionStatistics = Suppliers.memoize(() -> createEmptyPartitionStatistics(columnTypes, columnStatisticTypes));
int usedComputedStatistics = 0;
for (List<String> partitionValues : partitionValuesList) {
ComputedStatistics collectedStatistics = computedStatisticsMap.get(partitionValues);
if (collectedStatistics == null) {
partitionStatistics.put(partitionValues, emptyPartitionStatistics.get());
} else {
usedComputedStatistics++;
partitionStatistics.put(partitionValues, createPartitionStatistics(columnTypes, collectedStatistics));
}
}
verify(usedComputedStatistics == computedStatistics.size(), "All computed statistics must be used");
metastore.setPartitionStatistics(table, partitionStatistics.buildOrThrow());
}
}
use of io.trino.spi.statistics.ComputedStatistics in project trino by trinodb.
the class HiveMetadata method finishCreateTable.
@Override
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics) {
HiveOutputTableHandle handle = (HiveOutputTableHandle) tableHandle;
List<PartitionUpdate> partitionUpdates = fragments.stream().map(Slice::getBytes).map(partitionUpdateCodec::fromJson).collect(toImmutableList());
WriteInfo writeInfo = locationService.getQueryWriteInfo(handle.getLocationHandle());
Table table = buildTableObject(session.getQueryId(), handle.getSchemaName(), handle.getTableName(), handle.getTableOwner(), handle.getInputColumns(), handle.getTableStorageFormat(), handle.getPartitionedBy(), handle.getBucketProperty(), handle.getAdditionalTableParameters(), Optional.of(writeInfo.getTargetPath()), handle.isExternal(), prestoVersion, accessControlMetadata.isUsingSystemSecurity());
PrincipalPrivileges principalPrivileges = accessControlMetadata.isUsingSystemSecurity() ? NO_PRIVILEGES : buildInitialPrivilegeSet(handle.getTableOwner());
partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates);
if (handle.getBucketProperty().isPresent() && isCreateEmptyBucketFiles(session)) {
List<PartitionUpdate> partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, true, partitionUpdates);
// replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rollback
partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets));
for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) {
Optional<Partition> partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate));
createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames());
}
if (handle.isTransactional()) {
AcidTransaction transaction = handle.getTransaction();
List<String> partitionNames = partitionUpdates.stream().map(PartitionUpdate::getName).collect(toImmutableList());
metastore.addDynamicPartitions(handle.getSchemaName(), handle.getTableName(), partitionNames, transaction.getAcidTransactionId(), transaction.getWriteId(), AcidOperation.CREATE_TABLE);
}
}
Map<String, Type> columnTypes = handle.getInputColumns().stream().collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
Map<List<String>, ComputedStatistics> partitionComputedStatistics = createComputedStatisticsToPartitionMap(computedStatistics, handle.getPartitionedBy(), columnTypes);
PartitionStatistics tableStatistics;
if (table.getPartitionColumns().isEmpty()) {
HiveBasicStatistics basicStatistics = partitionUpdates.stream().map(PartitionUpdate::getStatistics).reduce((first, second) -> reduce(first, second, ADD)).orElse(createZeroStatistics());
tableStatistics = createPartitionStatistics(basicStatistics, columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
} else {
tableStatistics = new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of());
}
if (handle.getPartitionedBy().isEmpty()) {
List<String> fileNames;
if (partitionUpdates.isEmpty()) {
// creating empty table via CTAS ... WITH NO DATA
fileNames = ImmutableList.of();
} else {
fileNames = getOnlyElement(partitionUpdates).getFileNames();
}
metastore.createTable(session, table, principalPrivileges, Optional.of(writeInfo.getWritePath()), Optional.of(fileNames), false, tableStatistics, handle.isRetriesEnabled());
} else {
metastore.createTable(session, table, principalPrivileges, Optional.of(writeInfo.getWritePath()), Optional.empty(), false, tableStatistics, false);
}
if (!handle.getPartitionedBy().isEmpty()) {
if (isRespectTableFormat(session)) {
verify(handle.getPartitionStorageFormat() == handle.getTableStorageFormat());
}
for (PartitionUpdate update : partitionUpdates) {
Partition partition = buildPartitionObject(session, table, update);
PartitionStatistics partitionStatistics = createPartitionStatistics(update.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partition.getValues()));
metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), buildPartitionObject(session, table, update), update.getWritePath(), Optional.of(update.getFileNames()), partitionStatistics, handle.isRetriesEnabled());
}
}
return Optional.of(new HiveWrittenPartitions(partitionUpdates.stream().map(PartitionUpdate::getName).collect(toImmutableList())));
}
Aggregations