Use of io.prestosql.spi.statistics.ComputedStatistics in project hetu-core by openlookeng.
The class HiveMetadata, method finishInsertInternal.
private Optional<ConnectorOutputMetadata> finishInsertInternal(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics, List<PartitionUpdate> partitions, HiveACIDWriteType hiveACIDWriteType)
{
    HiveInsertTableHandle handle = (HiveInsertTableHandle) insertHandle;
    List<PartitionUpdate> partitionUpdates = fragments.stream()
            .map(Slice::getBytes)
            .map(partitionUpdateCodec::fromJson)
            // sort partition updates to ensure same sequence of rename in case of
            .sorted(Comparator.comparing(PartitionUpdate::getName))
            .collect(toList());
    HiveStorageFormat tableStorageFormat = handle.getTableStorageFormat();
    partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates);
    if (session.isSnapshotEnabled()) {
        Set<String> mergedFileNames = collectMergedFileNames(partitionUpdates);
        updateSnapshotFiles(session, handle, false, mergedFileNames, OptionalLong.empty());
        // Remove suffix from file names in partition updates
        partitionUpdates = updateSnapshotFileNames(partitionUpdates, session.getQueryId());
    }
    Table table = metastore.getTable(new HiveIdentity(session), handle.getSchemaName(), handle.getTableName())
            .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
    if (!table.getStorage().getStorageFormat().getInputFormat().equals(tableStorageFormat.getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) {
        throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
    }
    if (handle.getBucketProperty().isPresent() && HiveSessionProperties.isCreateEmptyBucketFiles(session)) {
        List<PartitionUpdate> partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, partitionUpdates);
        // replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rolling back
        partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets));
        for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) {
            Optional<Partition> partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate));
            createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames());
        }
    }
    List<String> partitionedBy = table.getPartitionColumns().stream().map(Column::getName).collect(toImmutableList());
    Map<String, Type> columnTypes = handle.getInputColumns().stream()
            .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
    Map<List<String>, ComputedStatistics> partitionComputedStatistics = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionedBy, columnTypes);
    for (PartitionUpdate partitionUpdate : partitionUpdates) {
        if (partitionUpdate.getName().isEmpty()) {
            // insert into unpartitioned table
            if (!table.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) {
                throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
            }
            PartitionStatistics partitionStatistics = createPartitionStatistics(session, partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
            if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) {
                finishInsertOverwrite(session, handle, table, partitionUpdate, partitionStatistics);
            }
            else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) {
                metastore.finishInsertIntoExistingTable(session, handle.getSchemaName(), handle.getTableName(), partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, hiveACIDWriteType);
            }
            else {
                throw new IllegalArgumentException("Unsupported update mode: " + partitionUpdate.getUpdateMode());
            }
        }
        else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) {
            // insert into existing partition
            List<String> partitionValues = toPartitionValues(partitionUpdate.getName());
            PartitionStatistics partitionStatistics = createPartitionStatistics(session, partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partitionValues));
            metastore.finishInsertIntoExistingPartition(session, handle.getSchemaName(), handle.getTableName(), partitionValues, partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, hiveACIDWriteType);
        }
        else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) {
            // insert into a new partition
            finishInsertInNewPartition(session, handle, table, columnTypes, partitionUpdate, partitionComputedStatistics, hiveACIDWriteType);
        }
        else {
            throw new IllegalArgumentException(format("Unsupported update mode: %s", partitionUpdate.getUpdateMode()));
        }
    }
    if (partitions != null) {
        partitions.addAll(partitionUpdates);
    }
    return Optional.of(new HiveWrittenPartitions(partitionUpdates.stream().map(PartitionUpdate::getName).collect(toList())));
}
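Every statistics lookup in this method is keyed by a partition's value list: createComputedStatisticsToPartitionMap groups the engine-supplied ComputedStatistics by grouping values, and the unpartitioned table is addressed with the empty list. A minimal, self-contained sketch of that keying scheme follows; the row counts and partition values are invented for illustration, and the fallback behavior of the private getColumnStatistics helper on a missing key is an assumption, not copied from the source.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class PartitionStatsLookup
{
    public static void main(String[] args)
    {
        // createComputedStatisticsToPartitionMap effectively produces a map like this
        // (row counts stand in for the real ComputedStatistics objects):
        Map<List<String>, Long> rowCountByPartition = new HashMap<>();
        rowCountByPartition.put(List.of("2024-01-01", "US"), 1_000L); // ds=2024-01-01/country=US
        rowCountByPartition.put(List.of(), 42L);                      // unpartitioned table: empty key

        // A hit uses the computed statistics; a miss degrades to "no statistics",
        // which is presumably how getColumnStatistics behaves on an absent key.
        System.out.println(rowCountByPartition.getOrDefault(List.of("2024-01-01", "US"), 0L)); // 1000
        System.out.println(rowCountByPartition.getOrDefault(List.of("2024-01-02", "US"), 0L)); // 0
    }
}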
Use of io.prestosql.spi.statistics.ComputedStatistics in project hetu-core by openlookeng.
The class HiveMetadata, method finishStatisticsCollection.
@Override
public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle, Collection<ComputedStatistics> computedStatistics)
{
    HiveIdentity identity = new HiveIdentity(session);
    HiveTableHandle handle = (HiveTableHandle) tableHandle;
    SchemaTableName tableName = handle.getSchemaTableName();
    Table table = metastore.getTable(identity, tableName.getSchemaName(), tableName.getTableName())
            .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
    List<Column> partitionColumns = table.getPartitionColumns();
    List<String> partitionColumnNames = partitionColumns.stream()
            .map(Column::getName)
            .collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = hiveColumnHandles(table);
    Map<String, Type> columnTypes = hiveColumnHandles.stream()
            .filter(columnHandle -> !columnHandle.isHidden())
            .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
    Map<List<String>, ComputedStatistics> computedStatisticsMap = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes);
    if (partitionColumns.isEmpty()) {
        // commit analyze to unpartitioned table
        metastore.setTableStatistics(identity, table, createPartitionStatistics(session, columnTypes, computedStatisticsMap.get(ImmutableList.<String>of())));
    }
    else {
        List<List<String>> partitionValuesList;
        if (handle.getAnalyzePartitionValues().isPresent()) {
            partitionValuesList = handle.getAnalyzePartitionValues().get();
        }
        else {
            partitionValuesList = metastore.getPartitionNames(identity, handle.getSchemaName(), handle.getTableName())
                    .orElseThrow(() -> new TableNotFoundException(((HiveTableHandle) tableHandle).getSchemaTableName()))
                    .stream()
                    .map(HiveUtil::toPartitionValues)
                    .collect(toImmutableList());
        }
        ImmutableMap.Builder<List<String>, PartitionStatistics> partitionStatistics = ImmutableMap.builder();
        Map<String, Set<ColumnStatisticType>> columnStatisticTypes = hiveColumnHandles.stream()
                .filter(columnHandle -> !partitionColumnNames.contains(columnHandle.getName()))
                .filter(column -> !column.isHidden())
                .collect(toImmutableMap(HiveColumnHandle::getName, column -> ImmutableSet.copyOf(metastore.getSupportedColumnStatistics(typeManager.getType(column.getTypeSignature())))));
        Supplier<PartitionStatistics> emptyPartitionStatistics = Suppliers.memoize(() -> Statistics.createEmptyPartitionStatistics(columnTypes, columnStatisticTypes));
        int usedComputedStatistics = 0;
        for (List<String> partitionValues : partitionValuesList) {
            ComputedStatistics collectedStatistics = computedStatisticsMap.get(partitionValues);
            if (collectedStatistics == null) {
                partitionStatistics.put(partitionValues, emptyPartitionStatistics.get());
            }
            else {
                usedComputedStatistics++;
                partitionStatistics.put(partitionValues, createPartitionStatistics(session, columnTypes, collectedStatistics));
            }
        }
        verify(usedComputedStatistics == computedStatistics.size(), "All computed statistics must be used");
        metastore.setPartitionStatistics(identity, table, partitionStatistics.build());
    }
}
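The partition-value keys above come from partition names via HiveUtil.toPartitionValues: a name such as ds=2024-01-01/country=US becomes the value list ["2024-01-01", "US"], which is exactly the key looked up in computedStatisticsMap. A simplified sketch of that decoding, assuming the plain column=value/... layout (the real helper also unescapes special characters in values, which this sketch skips):

import java.util.ArrayList;
import java.util.List;

public class PartitionNameDecoding
{
    static List<String> toPartitionValues(String partitionName)
    {
        List<String> values = new ArrayList<>();
        for (String component : partitionName.split("/")) {
            // each component is "column=value"; keep only the value part
            values.add(component.substring(component.indexOf('=') + 1));
        }
        return values;
    }

    public static void main(String[] args)
    {
        System.out.println(toPartitionValues("ds=2024-01-01/country=US")); // [2024-01-01, US]
    }
}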
Use of io.prestosql.spi.statistics.ComputedStatistics in project boostkit-bigdata by kunpengcompute.
The class HiveMetadata, method finishCreateTable.
public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession session, ConnectorOutputTableHandle tableHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics, Map<String, String> serdeParameters)
{
    HiveOutputTableHandle handle = (HiveOutputTableHandle) tableHandle;
    List<PartitionUpdate> partitionUpdates = fragments.stream()
            .map(Slice::getBytes)
            .map(partitionUpdateCodec::fromJson)
            .collect(toList());
    LocationService.WriteInfo writeInfo = locationService.getQueryWriteInfo(handle.getLocationHandle());
    Table table = buildTableObject(session.getQueryId(), handle.getSchemaName(), handle.getTableName(), handle.getTableOwner(), handle.getInputColumns(), handle.getTableStorageFormat(), handle.getPartitionedBy(), handle.getBucketProperty(), handle.getAdditionalTableParameters(), writeInfo.getTargetPath(), externalTable, prestoVersion, serdeParameters);
    PrincipalPrivileges principalPrivileges = MetastoreUtil.buildInitialPrivilegeSet(handle.getTableOwner());
    partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates);
    if (session.isSnapshotEnabled()) {
        Set<String> mergedFileNames = collectMergedFileNames(partitionUpdates);
        updateSnapshotFiles(session, handle, false, mergedFileNames, OptionalLong.empty());
        // Remove suffix from file names in partition updates
        partitionUpdates = updateSnapshotFileNames(partitionUpdates, session.getQueryId());
    }
    if (handle.getBucketProperty().isPresent() && HiveSessionProperties.isCreateEmptyBucketFiles(session)) {
        List<PartitionUpdate> partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, partitionUpdates);
        // replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rolling back
        partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets));
        for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) {
            Optional<Partition> partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate));
            createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames());
        }
    }
    Map<String, Type> columnTypes = handle.getInputColumns().stream()
            .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
    Map<List<String>, ComputedStatistics> partitionComputedStatistics = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, handle.getPartitionedBy(), columnTypes);
    PartitionStatistics tableStatistics;
    if (table.getPartitionColumns().isEmpty()) {
        HiveBasicStatistics basicStatistics = partitionUpdates.stream()
                .map(PartitionUpdate::getStatistics)
                .reduce((first, second) -> Statistics.reduce(first, second, Statistics.ReduceOperator.ADD))
                .orElse(HiveBasicStatistics.createZeroStatistics());
        tableStatistics = createPartitionStatistics(session, basicStatistics, columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
    }
    else {
        tableStatistics = new PartitionStatistics(HiveBasicStatistics.createEmptyStatistics(), ImmutableMap.of());
    }
    metastore.createTable(session, table, principalPrivileges, Optional.of(writeInfo.getWritePath()), false, tableStatistics);
    if (!handle.getPartitionedBy().isEmpty()) {
        if (HiveSessionProperties.isRespectTableFormat(session)) {
            verify(handle.getPartitionStorageFormat() == handle.getTableStorageFormat());
        }
        List<? extends Future<?>> futures = partitionUpdates.stream()
                .map(update -> hiveMetastoreClientService.submit(() -> {
                    Partition partition = buildPartitionObject(session, table, update);
                    PartitionStatistics partitionStatistics = createPartitionStatistics(session, update.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partition.getValues()));
                    // reuse the partition object built above rather than rebuilding it
                    metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition, update.getWritePath(), partitionStatistics, HiveACIDWriteType.NONE);
                }))
                .collect(toList());
        futures.forEach(future -> {
            try {
                future.get();
            }
            catch (InterruptedException | ExecutionException ignore) {
                log.debug("Get future error");
            }
        });
    }
    return Optional.of(new HiveWrittenPartitions(partitionUpdates.stream().map(PartitionUpdate::getName).collect(toList())));
}
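For an unpartitioned CTAS, the per-writer basic statistics are folded into a single table-level figure with ADD semantics before being handed to createPartitionStatistics. A dependency-free sketch of that fold, with BasicStats standing in for HiveBasicStatistics (the field set is assumed for illustration, not copied from the class):

import java.util.List;

public class StatsReduction
{
    record BasicStats(long fileCount, long rowCount, long inMemoryBytes, long onDiskBytes)
    {
        static BasicStats zero()
        {
            return new BasicStats(0, 0, 0, 0);
        }

        BasicStats add(BasicStats other)
        {
            return new BasicStats(
                    fileCount + other.fileCount,
                    rowCount + other.rowCount,
                    inMemoryBytes + other.inMemoryBytes,
                    onDiskBytes + other.onDiskBytes);
        }
    }

    public static void main(String[] args)
    {
        List<BasicStats> perWriter = List.of(
                new BasicStats(2, 1_000, 4_096, 2_048),
                new BasicStats(1, 500, 2_048, 1_024));
        // mirrors partitionUpdates.stream().map(...).reduce(ADD).orElse(createZeroStatistics())
        BasicStats table = perWriter.stream().reduce(BasicStats::add).orElse(BasicStats.zero());
        System.out.println(table); // BasicStats[fileCount=3, rowCount=1500, ...]
    }
}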
Use of io.prestosql.spi.statistics.ComputedStatistics in project boostkit-bigdata by kunpengcompute.
The class HiveMetadata, method finishInsertInternal.
private Optional<ConnectorOutputMetadata> finishInsertInternal(ConnectorSession session, ConnectorInsertTableHandle insertHandle, Collection<Slice> fragments, Collection<ComputedStatistics> computedStatistics, List<PartitionUpdate> partitions, HiveACIDWriteType hiveACIDWriteType)
{
    HiveInsertTableHandle handle = (HiveInsertTableHandle) insertHandle;
    List<PartitionUpdate> partitionUpdates = fragments.stream()
            .map(Slice::getBytes)
            .map(partitionUpdateCodec::fromJson)
            // sort partition updates to ensure same sequence of rename in case of
            .sorted(Comparator.comparing(PartitionUpdate::getName))
            .collect(toList());
    HiveStorageFormat tableStorageFormat = handle.getTableStorageFormat();
    partitionUpdates = PartitionUpdate.mergePartitionUpdates(partitionUpdates);
    if (session.isSnapshotEnabled()) {
        Set<String> mergedFileNames = collectMergedFileNames(partitionUpdates);
        updateSnapshotFiles(session, handle, false, mergedFileNames, OptionalLong.empty());
        // Remove suffix from file names in partition updates
        partitionUpdates = updateSnapshotFileNames(partitionUpdates, session.getQueryId());
    }
    Table table = metastore.getTable(new HiveIdentity(session), handle.getSchemaName(), handle.getTableName())
            .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
    if (!table.getStorage().getStorageFormat().getInputFormat().equals(tableStorageFormat.getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) {
        throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
    }
    if (handle.getBucketProperty().isPresent() && HiveSessionProperties.isCreateEmptyBucketFiles(session)) {
        List<PartitionUpdate> partitionUpdatesForMissingBuckets = computePartitionUpdatesForMissingBuckets(session, handle, table, partitionUpdates);
        // replace partitionUpdates before creating the empty files so that those files will be cleaned up if we end up rolling back
        partitionUpdates = PartitionUpdate.mergePartitionUpdates(concat(partitionUpdates, partitionUpdatesForMissingBuckets));
        for (PartitionUpdate partitionUpdate : partitionUpdatesForMissingBuckets) {
            Optional<Partition> partition = table.getPartitionColumns().isEmpty() ? Optional.empty() : Optional.of(buildPartitionObject(session, table, partitionUpdate));
            createEmptyFiles(session, partitionUpdate.getWritePath(), table, partition, partitionUpdate.getFileNames());
        }
    }
    List<String> partitionedBy = table.getPartitionColumns().stream().map(Column::getName).collect(toImmutableList());
    Map<String, Type> columnTypes = handle.getInputColumns().stream()
            .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
    Map<List<String>, ComputedStatistics> partitionComputedStatistics = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionedBy, columnTypes);
    for (PartitionUpdate partitionUpdate : partitionUpdates) {
        if (partitionUpdate.getName().isEmpty()) {
            // insert into unpartitioned table
            if (!table.getStorage().getStorageFormat().getInputFormat().equals(handle.getPartitionStorageFormat().getInputFormat()) && HiveSessionProperties.isRespectTableFormat(session)) {
                throw new PrestoException(HiveErrorCode.HIVE_CONCURRENT_MODIFICATION_DETECTED, "Table format changed during insert");
            }
            PartitionStatistics partitionStatistics = createPartitionStatistics(session, partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
            if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) {
                finishInsertOverwrite(session, handle, table, partitionUpdate, partitionStatistics);
            }
            else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) {
                metastore.finishInsertIntoExistingTable(session, handle.getSchemaName(), handle.getTableName(), partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, hiveACIDWriteType);
            }
            else {
                throw new IllegalArgumentException("Unsupported update mode: " + partitionUpdate.getUpdateMode());
            }
        }
        else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.APPEND) {
            // insert into existing partition
            List<String> partitionValues = toPartitionValues(partitionUpdate.getName());
            PartitionStatistics partitionStatistics = createPartitionStatistics(session, partitionUpdate.getStatistics(), columnTypes, getColumnStatistics(partitionComputedStatistics, partitionValues));
            metastore.finishInsertIntoExistingPartition(session, handle.getSchemaName(), handle.getTableName(), partitionValues, partitionUpdate.getWritePath(), partitionUpdate.getFileNames(), partitionStatistics, hiveACIDWriteType);
        }
        else if (partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.NEW || partitionUpdate.getUpdateMode() == PartitionUpdate.UpdateMode.OVERWRITE) {
            // insert into a new partition
            finishInsertInNewPartition(session, handle, table, columnTypes, partitionUpdate, partitionComputedStatistics, hiveACIDWriteType);
        }
        else {
            throw new IllegalArgumentException(format("Unsupported update mode: %s", partitionUpdate.getUpdateMode()));
        }
    }
    if (partitions != null) {
        partitions.addAll(partitionUpdates);
    }
    return Optional.of(new HiveWrittenPartitions(partitionUpdates.stream().map(PartitionUpdate::getName).collect(toList())));
}
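The branch structure above is a two-level decision: first whether the update names a partition at all, then its UpdateMode. A condensed, illustrative model of that decision table (UpdateMode mirrors PartitionUpdate.UpdateMode; the returned strings merely name the methods the real code invokes):

public class UpdateModeDispatch
{
    enum UpdateMode { NEW, APPEND, OVERWRITE }

    static String dispatch(String partitionName, UpdateMode mode)
    {
        if (partitionName.isEmpty()) {
            // unpartitioned table
            return switch (mode) {
                case OVERWRITE -> "finishInsertOverwrite";
                case NEW, APPEND -> "finishInsertIntoExistingTable";
            };
        }
        return switch (mode) {
            case APPEND -> "finishInsertIntoExistingPartition";
            case NEW, OVERWRITE -> "finishInsertInNewPartition";
        };
    }

    public static void main(String[] args)
    {
        System.out.println(dispatch("", UpdateMode.OVERWRITE));              // finishInsertOverwrite
        System.out.println(dispatch("ds=2024-01-01", UpdateMode.APPEND));    // finishInsertIntoExistingPartition
    }
}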
Use of io.prestosql.spi.statistics.ComputedStatistics in project boostkit-bigdata by kunpengcompute.
The class HiveMetadata, method finishStatisticsCollection.
@Override
public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle, Collection<ComputedStatistics> computedStatistics)
{
    HiveIdentity identity = new HiveIdentity(session);
    HiveTableHandle handle = (HiveTableHandle) tableHandle;
    SchemaTableName tableName = handle.getSchemaTableName();
    Table table = metastore.getTable(identity, tableName.getSchemaName(), tableName.getTableName())
            .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
    List<Column> partitionColumns = table.getPartitionColumns();
    List<String> partitionColumnNames = partitionColumns.stream()
            .map(Column::getName)
            .collect(toImmutableList());
    List<HiveColumnHandle> hiveColumnHandles = hiveColumnHandles(table);
    Map<String, Type> columnTypes = hiveColumnHandles.stream()
            .filter(columnHandle -> !columnHandle.isHidden())
            .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
    Map<List<String>, ComputedStatistics> computedStatisticsMap = Statistics.createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes);
    if (partitionColumns.isEmpty()) {
        // commit analyze to unpartitioned table
        metastore.setTableStatistics(identity, table, createPartitionStatistics(session, columnTypes, computedStatisticsMap.get(ImmutableList.<String>of())));
    }
    else {
        List<List<String>> partitionValuesList;
        if (handle.getAnalyzePartitionValues().isPresent()) {
            partitionValuesList = handle.getAnalyzePartitionValues().get();
        }
        else {
            partitionValuesList = metastore.getPartitionNames(identity, handle.getSchemaName(), handle.getTableName())
                    .orElseThrow(() -> new TableNotFoundException(((HiveTableHandle) tableHandle).getSchemaTableName()))
                    .stream()
                    .map(HiveUtil::toPartitionValues)
                    .collect(toImmutableList());
        }
        ImmutableMap.Builder<List<String>, PartitionStatistics> partitionStatistics = ImmutableMap.builder();
        Map<String, Set<ColumnStatisticType>> columnStatisticTypes = hiveColumnHandles.stream()
                .filter(columnHandle -> !partitionColumnNames.contains(columnHandle.getName()))
                .filter(column -> !column.isHidden())
                .collect(toImmutableMap(HiveColumnHandle::getName, column -> ImmutableSet.copyOf(metastore.getSupportedColumnStatistics(typeManager.getType(column.getTypeSignature())))));
        Supplier<PartitionStatistics> emptyPartitionStatistics = Suppliers.memoize(() -> Statistics.createEmptyPartitionStatistics(columnTypes, columnStatisticTypes));
        int usedComputedStatistics = 0;
        for (List<String> partitionValues : partitionValuesList) {
            ComputedStatistics collectedStatistics = computedStatisticsMap.get(partitionValues);
            if (collectedStatistics == null) {
                partitionStatistics.put(partitionValues, emptyPartitionStatistics.get());
            }
            else {
                usedComputedStatistics++;
                partitionStatistics.put(partitionValues, createPartitionStatistics(session, columnTypes, collectedStatistics));
            }
        }
        verify(usedComputedStatistics == computedStatistics.size(), "All computed statistics must be used");
        metastore.setPartitionStatistics(identity, table, partitionStatistics.build());
    }
}
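Note the Suppliers.memoize wrapper around the empty-statistics fallback: every partition that lacks computed statistics shares one lazily built PartitionStatistics object instead of rebuilding it per partition. A dependency-free sketch of that memoization pattern (Guava's Suppliers.memoize is additionally thread-safe; this minimal version is not):

import java.util.function.Supplier;

public class MemoizedSupplier
{
    static <T> Supplier<T> memoize(Supplier<T> delegate)
    {
        return new Supplier<T>()
        {
            private T value; // cached after the first call

            @Override
            public T get()
            {
                if (value == null) {
                    value = delegate.get();
                }
                return value;
            }
        };
    }

    public static void main(String[] args)
    {
        Supplier<String> emptyStats = memoize(() -> {
            System.out.println("building empty statistics once");
            return "EMPTY_STATS";
        });
        emptyStats.get();
        emptyStats.get(); // the message above prints only once
    }
}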