use of io.trino.spi.predicate.Domain in project trino by trinodb.
the class TestDynamicFilterService method testDynamicFilterSummaryCompletion.
@Test
public void testDynamicFilterSummaryCompletion() {
DynamicFilterService dynamicFilterService = createDynamicFilterService();
DynamicFilterId filterId = new DynamicFilterId("df");
QueryId queryId = new QueryId("query");
StageId stageId = new StageId(queryId, 0);
dynamicFilterService.registerQuery(queryId, session, ImmutableSet.of(filterId), ImmutableSet.of(filterId), ImmutableSet.of());
dynamicFilterService.stageCannotScheduleMoreTasks(stageId, 0, 3);
assertFalse(dynamicFilterService.getSummary(queryId, filterId).isPresent());
// assert initial dynamic filtering stats
DynamicFiltersStats stats = dynamicFilterService.getDynamicFilteringStats(queryId, session);
assertEquals(stats.getTotalDynamicFilters(), 1);
assertEquals(stats.getDynamicFiltersCompleted(), 0);
assertEquals(stats.getLazyDynamicFilters(), 1);
assertEquals(stats.getReplicatedDynamicFilters(), 0);
dynamicFilterService.addTaskDynamicFilters(new TaskId(stageId, 0, 0), ImmutableMap.of(filterId, singleValue(INTEGER, 1L)));
assertFalse(dynamicFilterService.getSummary(queryId, filterId).isPresent());
stats = dynamicFilterService.getDynamicFilteringStats(queryId, session);
assertEquals(stats.getDynamicFiltersCompleted(), 0);
dynamicFilterService.addTaskDynamicFilters(new TaskId(stageId, 1, 0), ImmutableMap.of(filterId, singleValue(INTEGER, 2L)));
assertFalse(dynamicFilterService.getSummary(queryId, filterId).isPresent());
stats = dynamicFilterService.getDynamicFilteringStats(queryId, session);
assertEquals(stats.getDynamicFiltersCompleted(), 0);
dynamicFilterService.addTaskDynamicFilters(new TaskId(stageId, 2, 0), ImmutableMap.of(filterId, singleValue(INTEGER, 3L)));
Optional<Domain> summary = dynamicFilterService.getSummary(queryId, filterId);
assertTrue(summary.isPresent());
assertEquals(summary.get(), multipleValues(INTEGER, ImmutableList.of(1L, 2L, 3L)));
stats = dynamicFilterService.getDynamicFilteringStats(queryId, session);
assertEquals(stats.getDynamicFiltersCompleted(), 1);
assertEquals(stats.getLazyDynamicFilters(), 1);
assertEquals(stats.getReplicatedDynamicFilters(), 0);
assertEquals(stats.getDynamicFilterDomainStats(), ImmutableList.of(new DynamicFilterDomainStats(filterId, getSimplifiedDomainString(1L, 3L, 3, INTEGER))));
}
use of io.trino.spi.predicate.Domain in project trino by trinodb.
the class DeltaLakeSplitManager method getSplits.
private Stream<DeltaLakeSplit> getSplits(ConnectorTransactionHandle transaction, DeltaLakeTableHandle tableHandle, ConnectorSession session, Optional<DataSize> maxScannedFileSize, Set<ColumnHandle> columnsCoveredByDynamicFilter, Constraint constraint) {
DeltaLakeMetastore metastore = getMetastore(session, transaction);
String tableLocation = metastore.getTableLocation(tableHandle.getSchemaTableName(), session);
List<AddFileEntry> validDataFiles = metastore.getValidDataFiles(tableHandle.getSchemaTableName(), session);
TupleDomain<DeltaLakeColumnHandle> enforcedPartitionConstraint = tableHandle.getEnforcedPartitionConstraint();
TupleDomain<DeltaLakeColumnHandle> nonPartitionConstraint = tableHandle.getNonPartitionConstraint();
// Delta Lake handles updates and deletes by copying entire data files, minus updates/deletes. Because of this we can only have one Split/UpdatablePageSource
// per file.
boolean splittable = tableHandle.getWriteType().isEmpty();
AtomicInteger remainingInitialSplits = new AtomicInteger(maxInitialSplits);
Optional<Instant> filesModifiedAfter = tableHandle.getAnalyzeHandle().flatMap(AnalyzeHandle::getFilesModifiedAfter);
Optional<Long> maxScannedFileSizeInBytes = maxScannedFileSize.map(DataSize::toBytes);
Set<String> predicatedColumnNames = Stream.concat(nonPartitionConstraint.getDomains().orElseThrow().keySet().stream(), columnsCoveredByDynamicFilter.stream().map(DeltaLakeColumnHandle.class::cast)).map(// TODO is DeltaLakeColumnHandle.name normalized?
column -> column.getName().toLowerCase(ENGLISH)).collect(toImmutableSet());
List<ColumnMetadata> schema = extractSchema(tableHandle.getMetadataEntry(), typeManager);
List<ColumnMetadata> predicatedColumns = schema.stream().filter(// ColumnMetadata.name is lowercase
column -> predicatedColumnNames.contains(column.getName())).collect(toImmutableList());
return validDataFiles.stream().flatMap(addAction -> {
if (tableHandle.getAnalyzeHandle().isPresent() && !tableHandle.getAnalyzeHandle().get().isInitialAnalyze() && !addAction.isDataChange()) {
// skip files which do not introduce data change on non-initial ANALYZE
return Stream.empty();
}
if (filesModifiedAfter.isPresent() && addAction.getModificationTime() <= filesModifiedAfter.get().toEpochMilli()) {
return Stream.empty();
}
if (maxScannedFileSizeInBytes.isPresent() && addAction.getSize() > maxScannedFileSizeInBytes.get()) {
return Stream.empty();
}
Map<DeltaLakeColumnHandle, Domain> enforcedDomains = enforcedPartitionConstraint.getDomains().orElseThrow();
if (!partitionMatchesPredicate(addAction.getCanonicalPartitionValues(), enforcedDomains)) {
return Stream.empty();
}
TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(addAction, predicatedColumns, tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
if (!nonPartitionConstraint.overlaps(statisticsPredicate)) {
return Stream.empty();
}
if (constraint.predicate().isPresent()) {
Map<String, Optional<String>> partitionValues = addAction.getCanonicalPartitionValues();
Map<ColumnHandle, NullableValue> deserializedValues = constraint.getPredicateColumns().orElseThrow().stream().filter(column -> column instanceof DeltaLakeColumnHandle).filter(column -> partitionValues.containsKey(((DeltaLakeColumnHandle) column).getName())).collect(toImmutableMap(identity(), column -> {
DeltaLakeColumnHandle deltaLakeColumn = (DeltaLakeColumnHandle) column;
return NullableValue.of(deltaLakeColumn.getType(), deserializePartitionValue(deltaLakeColumn, addAction.getCanonicalPartitionValues().get(deltaLakeColumn.getName())));
}));
if (!constraint.predicate().get().test(deserializedValues)) {
return Stream.empty();
}
}
return splitsForFile(session, addAction, tableLocation, addAction.getCanonicalPartitionValues(), statisticsPredicate, splittable, remainingInitialSplits).stream();
});
}
use of io.trino.spi.predicate.Domain in project trino by trinodb.
the class DeltaLakeSplitManager method partitionMatchesPredicate.
public static boolean partitionMatchesPredicate(Map<String, Optional<String>> partitionKeys, Map<DeltaLakeColumnHandle, Domain> domains) {
for (Map.Entry<DeltaLakeColumnHandle, Domain> enforcedDomainsEntry : domains.entrySet()) {
DeltaLakeColumnHandle partitionColumn = enforcedDomainsEntry.getKey();
Domain partitionDomain = enforcedDomainsEntry.getValue();
if (!partitionDomain.includesNullableValue(deserializePartitionValue(partitionColumn, partitionKeys.get(partitionColumn.getName())))) {
return false;
}
}
return true;
}
use of io.trino.spi.predicate.Domain in project trino by trinodb.
the class ParquetPageSourceFactory method getParquetTupleDomain.
public static TupleDomain<ColumnDescriptor> getParquetTupleDomain(Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<HiveColumnHandle> effectivePredicate, MessageType fileSchema, boolean useColumnNames) {
if (effectivePredicate.isNone()) {
return TupleDomain.none();
}
ImmutableMap.Builder<ColumnDescriptor, Domain> predicate = ImmutableMap.builder();
for (Entry<HiveColumnHandle, Domain> entry : effectivePredicate.getDomains().get().entrySet()) {
HiveColumnHandle columnHandle = entry.getKey();
// skip looking up predicates for complex types as Parquet only stores stats for primitives
if (columnHandle.getHiveType().getCategory() != PRIMITIVE || columnHandle.getColumnType() != REGULAR) {
continue;
}
RichColumnDescriptor descriptor;
if (useColumnNames) {
descriptor = descriptorsByPath.get(ImmutableList.of(columnHandle.getName()));
} else {
org.apache.parquet.schema.Type parquetField = getParquetType(columnHandle, fileSchema, false);
if (parquetField == null || !parquetField.isPrimitive()) {
// Or the field is a complex type
continue;
}
descriptor = descriptorsByPath.get(ImmutableList.of(parquetField.getName()));
}
if (descriptor != null) {
predicate.put(descriptor, entry.getValue());
}
}
return TupleDomain.withColumnDomains(predicate.buildOrThrow());
}
use of io.trino.spi.predicate.Domain in project trino by trinodb.
the class ParquetPageSourceFactory method getColumnIndexStore.
private static Optional<ColumnIndexStore> getColumnIndexStore(ParquetDataSource dataSource, BlockMetaData blockMetadata, Map<List<String>, RichColumnDescriptor> descriptorsByPath, TupleDomain<ColumnDescriptor> parquetTupleDomain, ParquetReaderOptions options) {
if (!options.isUseColumnIndex() || parquetTupleDomain.isAll() || parquetTupleDomain.isNone()) {
return Optional.empty();
}
boolean hasColumnIndex = false;
for (ColumnChunkMetaData column : blockMetadata.getColumns()) {
if (column.getColumnIndexReference() != null && column.getOffsetIndexReference() != null) {
hasColumnIndex = true;
break;
}
}
if (!hasColumnIndex) {
return Optional.empty();
}
Set<ColumnPath> columnsReadPaths = new HashSet<>(descriptorsByPath.size());
for (List<String> path : descriptorsByPath.keySet()) {
columnsReadPaths.add(ColumnPath.get(path.toArray(new String[0])));
}
Map<ColumnDescriptor, Domain> parquetDomains = parquetTupleDomain.getDomains().orElseThrow(() -> new IllegalStateException("Predicate other than none should have domains"));
Set<ColumnPath> columnsFilteredPaths = parquetDomains.keySet().stream().map(column -> ColumnPath.get(column.getPath())).collect(toImmutableSet());
return Optional.of(new TrinoColumnIndexStore(dataSource, blockMetadata, columnsReadPaths, columnsFilteredPaths));
}
Aggregations