Use of io.trino.plugin.deltalake.metastore.DeltaLakeMetastore in project trino by trinodb.
The class DeltaLakeSplitManager, method getSplits.
private Stream<DeltaLakeSplit> getSplits(
        ConnectorTransactionHandle transaction,
        DeltaLakeTableHandle tableHandle,
        ConnectorSession session,
        Optional<DataSize> maxScannedFileSize,
        Set<ColumnHandle> columnsCoveredByDynamicFilter,
        Constraint constraint)
{
    DeltaLakeMetastore metastore = getMetastore(session, transaction);
    String tableLocation = metastore.getTableLocation(tableHandle.getSchemaTableName(), session);
    List<AddFileEntry> validDataFiles = metastore.getValidDataFiles(tableHandle.getSchemaTableName(), session);
    TupleDomain<DeltaLakeColumnHandle> enforcedPartitionConstraint = tableHandle.getEnforcedPartitionConstraint();
    TupleDomain<DeltaLakeColumnHandle> nonPartitionConstraint = tableHandle.getNonPartitionConstraint();

    // Delta Lake handles updates and deletes by copying entire data files, minus updates/deletes.
    // Because of this we can only have one Split/UpdatablePageSource per file.
    boolean splittable = tableHandle.getWriteType().isEmpty();
    AtomicInteger remainingInitialSplits = new AtomicInteger(maxInitialSplits);
    Optional<Instant> filesModifiedAfter = tableHandle.getAnalyzeHandle().flatMap(AnalyzeHandle::getFilesModifiedAfter);
    Optional<Long> maxScannedFileSizeInBytes = maxScannedFileSize.map(DataSize::toBytes);

    Set<String> predicatedColumnNames = Stream.concat(
                    nonPartitionConstraint.getDomains().orElseThrow().keySet().stream(),
                    columnsCoveredByDynamicFilter.stream().map(DeltaLakeColumnHandle.class::cast))
            // TODO is DeltaLakeColumnHandle.name normalized?
            .map(column -> column.getName().toLowerCase(ENGLISH))
            .collect(toImmutableSet());
    List<ColumnMetadata> schema = extractSchema(tableHandle.getMetadataEntry(), typeManager);
    List<ColumnMetadata> predicatedColumns = schema.stream()
            // ColumnMetadata.name is lowercase
            .filter(column -> predicatedColumnNames.contains(column.getName()))
            .collect(toImmutableList());

    return validDataFiles.stream().flatMap(addAction -> {
        if (tableHandle.getAnalyzeHandle().isPresent() && !tableHandle.getAnalyzeHandle().get().isInitialAnalyze() && !addAction.isDataChange()) {
            // skip files which do not introduce data change on non-initial ANALYZE
            return Stream.empty();
        }
        if (filesModifiedAfter.isPresent() && addAction.getModificationTime() <= filesModifiedAfter.get().toEpochMilli()) {
            return Stream.empty();
        }
        if (maxScannedFileSizeInBytes.isPresent() && addAction.getSize() > maxScannedFileSizeInBytes.get()) {
            return Stream.empty();
        }
        Map<DeltaLakeColumnHandle, Domain> enforcedDomains = enforcedPartitionConstraint.getDomains().orElseThrow();
        if (!partitionMatchesPredicate(addAction.getCanonicalPartitionValues(), enforcedDomains)) {
            return Stream.empty();
        }
        TupleDomain<DeltaLakeColumnHandle> statisticsPredicate = createStatisticsPredicate(
                addAction,
                predicatedColumns,
                tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!nonPartitionConstraint.overlaps(statisticsPredicate)) {
            return Stream.empty();
        }
        if (constraint.predicate().isPresent()) {
            Map<String, Optional<String>> partitionValues = addAction.getCanonicalPartitionValues();
            Map<ColumnHandle, NullableValue> deserializedValues = constraint.getPredicateColumns().orElseThrow().stream()
                    .filter(column -> column instanceof DeltaLakeColumnHandle)
                    .filter(column -> partitionValues.containsKey(((DeltaLakeColumnHandle) column).getName()))
                    .collect(toImmutableMap(identity(), column -> {
                        DeltaLakeColumnHandle deltaLakeColumn = (DeltaLakeColumnHandle) column;
                        return NullableValue.of(
                                deltaLakeColumn.getType(),
                                deserializePartitionValue(deltaLakeColumn, addAction.getCanonicalPartitionValues().get(deltaLakeColumn.getName())));
                    }));
            if (!constraint.predicate().get().test(deserializedValues)) {
                return Stream.empty();
            }
        }
        return splitsForFile(session, addAction, tableLocation, addAction.getCanonicalPartitionValues(), statisticsPredicate, splittable, remainingInitialSplits).stream();
    });
}
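The bulk of the method is per-file pruning: each AddFileEntry read from the transaction log is dropped if it fails the non-initial-ANALYZE data-change check, the modification-time cutoff, the maximum-file-size limit, the enforced partition constraint, or the file-statistics and constraint predicates; only the surviving files are expanded into splits. Below is a minimal, self-contained sketch of that pruning idea using plain Java collections instead of Trino's AddFileEntry and TupleDomain types; FileEntry, prune, and allowedPartitionValues are hypothetical stand-ins, not Trino APIs.

// A self-contained sketch of the per-file pruning idea above, using plain Java
// collections instead of Trino's AddFileEntry/TupleDomain types. FileEntry, prune,
// and allowedPartitionValues are hypothetical stand-ins, not Trino APIs.
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Stream;

record FileEntry(String path, long size, long modificationTime, Map<String, Optional<String>> partitionValues) {}

final class FilePruningSketch
{
    static Stream<FileEntry> prune(
            List<FileEntry> files,
            Map<String, Set<Optional<String>>> allowedPartitionValues, // stand-in for the enforced partition constraint
            Optional<Long> maxFileSizeInBytes, // stand-in for the max scanned file size limit
            Optional<Long> filesModifiedAfterMillis) // stand-in for the files-modified-after cutoff used by ANALYZE
    {
        return files.stream().filter(file -> {
            // skip files not modified after the cutoff
            if (filesModifiedAfterMillis.isPresent() && file.modificationTime() <= filesModifiedAfterMillis.get()) {
                return false;
            }
            // skip files larger than the configured maximum
            if (maxFileSizeInBytes.isPresent() && file.size() > maxFileSizeInBytes.get()) {
                return false;
            }
            // keep only files whose partition values satisfy every enforced partition domain
            return allowedPartitionValues.entrySet().stream().allMatch(entry ->
                    entry.getValue().contains(file.partitionValues().getOrDefault(entry.getKey(), Optional.empty())));
        });
    }
}

The real method goes further than this sketch: it also checks file statistics against the non-partition constraint (createStatisticsPredicate) and evaluates the caller-supplied Constraint predicate over deserialized partition values before emitting splits via splitsForFile.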
Use of io.trino.plugin.deltalake.metastore.DeltaLakeMetastore in project trino by trinodb.
The class TestDeltaLakeMetadata, method setUp.
@BeforeClass
public void setUp()
        throws IOException
{
    temporaryCatalogDirectory = createTempDirectory("HiveCatalog").toFile();
    Map<String, String> config = ImmutableMap.<String, String>builder()
            .put("hive.metastore", "file")
            .put("hive.metastore.catalog.dir", temporaryCatalogDirectory.getPath())
            .buildOrThrow();

    Bootstrap app = new Bootstrap(
            // connector dependencies
            new JsonModule(),
            binder -> {
                ConnectorContext context = new TestingConnectorContext();
                binder.bind(NodeVersion.class).toInstance(new NodeVersion(context.getNodeManager().getCurrentNode().getVersion()));
                binder.bind(CatalogName.class).toInstance(new CatalogName("test"));
                binder.bind(TypeManager.class).toInstance(context.getTypeManager());
                binder.bind(NodeManager.class).toInstance(context.getNodeManager());
                binder.bind(PageIndexerFactory.class).toInstance(context.getPageIndexerFactory());
            },
            // connector modules
            new DeltaLakeMetastoreModule(),
            new DeltaLakeModule(),
            // test setup
            binder -> binder.bind(HdfsEnvironment.class).toInstance(HDFS_ENVIRONMENT),
            new AbstractModule() {
                @Provides
                public DeltaLakeMetastore getDeltaLakeMetastore(@RawHiveMetastoreFactory HiveMetastoreFactory hiveMetastoreFactory, TransactionLogAccess transactionLogAccess, TypeManager typeManager, CachingDeltaLakeStatisticsAccess statistics)
                {
                    return new HiveMetastoreBackedDeltaLakeMetastore(hiveMetastoreFactory.createMetastore(Optional.empty()), transactionLogAccess, typeManager, statistics);
                }
            });

    Injector injector = app
            .doNotInitializeLogging()
            .setRequiredConfigurationProperties(config)
            .initialize();
    deltaLakeMetadataFactory = injector.getInstance(DeltaLakeMetadataFactory.class);

    injector.getInstance(DeltaLakeMetastore.class).createDatabase(Database.builder()
            .setDatabaseName(DATABASE_NAME)
            .setOwnerName(Optional.of("test"))
            .setOwnerType(Optional.of(USER))
            .setLocation(Optional.empty())
            .build());
}
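The key piece of this setup is the anonymous AbstractModule at the end: its @Provides method tells Guice how to construct the DeltaLakeMetastore the test needs, building a HiveMetastoreBackedDeltaLakeMetastore on top of the file-based Hive metastore configured above. Below is a minimal plain-Guice sketch of that provider-method pattern; Greeter and the string it returns are hypothetical, and the real test wires the Trino modules through Airlift's Bootstrap rather than Guice.createInjector.

// A minimal plain-Guice sketch of the @Provides pattern used above.
// Greeter and its binding are hypothetical; the real test wires Trino modules
// through Airlift's Bootstrap rather than Guice.createInjector.
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.inject.Provides;

interface Greeter
{
    String greet();
}

public class ProvidesMethodSketch
{
    public static void main(String[] args)
    {
        Injector injector = Guice.createInjector(new AbstractModule()
        {
            @Provides
            Greeter greeter()
            {
                // the test above supplies DeltaLakeMetastore the same way,
                // constructing the implementation from injected dependencies
                return () -> "hello from a provider method";
            }
        });
        System.out.println(injector.getInstance(Greeter.class).greet());
    }
}

Binding through a provider method keeps the test in control of exactly one dependency, while the surrounding DeltaLakeMetastoreModule and DeltaLakeModule supply the rest of the connector's object graph.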