Search in sources :

Example 1 with DeltaLakeMetastore

use of io.trino.plugin.deltalake.metastore.DeltaLakeMetastore in project trino by trinodb.

In the class DeltaLakeSplitManager, the method getSplits:

/**
 * Builds a lazily evaluated stream of splits for the valid data files of the given table.
 *
 * <p>Each data file reported by the metastore is checked, in order, against the ANALYZE
 * handle settings, the optional modification-time and file-size limits, the enforced
 * partition constraint, the per-file statistics predicate and finally the optional
 * {@code constraint} predicate. Files rejected by any check contribute no splits.
 *
 * @param transaction transaction handle used to look up the metastore
 * @param tableHandle table whose data files are enumerated
 * @param session current connector session
 * @param maxScannedFileSize when present, files larger than this size are skipped
 * @param columnsCoveredByDynamicFilter columns for which file statistics are evaluated in addition to
 *         the columns of the non-partition constraint; every element is cast to {@code DeltaLakeColumnHandle}
 * @param constraint constraint whose predicate, when present, is tested against the deserialized partition values
 * @return stream of splits for all files that pass every check
 */
private Stream<DeltaLakeSplit> getSplits(ConnectorTransactionHandle transaction, DeltaLakeTableHandle tableHandle, ConnectorSession session, Optional<DataSize> maxScannedFileSize, Set<ColumnHandle> columnsCoveredByDynamicFilter, Constraint constraint) {
    DeltaLakeMetastore deltaMetastore = getMetastore(session, transaction);
    String location = deltaMetastore.getTableLocation(tableHandle.getSchemaTableName(), session);
    List<AddFileEntry> dataFiles = deltaMetastore.getValidDataFiles(tableHandle.getSchemaTableName(), session);
    TupleDomain<DeltaLakeColumnHandle> partitionConstraint = tableHandle.getEnforcedPartitionConstraint();
    TupleDomain<DeltaLakeColumnHandle> dataConstraint = tableHandle.getNonPartitionConstraint();

    // Delta Lake handles updates and deletes by copying entire data files, minus updates/deletes.
    // Because of this we can only have one Split/UpdatablePageSource per file.
    boolean fileSplittable = tableHandle.getWriteType().isEmpty();
    AtomicInteger initialSplitBudget = new AtomicInteger(maxInitialSplits);
    Optional<Instant> modifiedAfter = tableHandle.getAnalyzeHandle().flatMap(AnalyzeHandle::getFilesModifiedAfter);
    Optional<Long> maxFileBytes = maxScannedFileSize.map(DataSize::toBytes);

    // Names of all columns that carry a static or dynamic predicate.
    Stream<DeltaLakeColumnHandle> constrainedColumns = dataConstraint.getDomains().orElseThrow().keySet().stream();
    Stream<DeltaLakeColumnHandle> dynamicFilterColumns = columnsCoveredByDynamicFilter.stream().map(DeltaLakeColumnHandle.class::cast);
    // TODO is DeltaLakeColumnHandle.name normalized?
    Set<String> filteredColumnNames = Stream.concat(constrainedColumns, dynamicFilterColumns)
            .map(handle -> handle.getName().toLowerCase(ENGLISH))
            .collect(toImmutableSet());

    List<ColumnMetadata> tableSchema = extractSchema(tableHandle.getMetadataEntry(), typeManager);
    // ColumnMetadata.name is lowercase
    List<ColumnMetadata> filteredColumns = tableSchema.stream()
            .filter(metadata -> filteredColumnNames.contains(metadata.getName()))
            .collect(toImmutableList());

    return dataFiles.stream().flatMap(file -> {
        // skip files which do not introduce data change on non-initial ANALYZE
        boolean nonInitialAnalyze = tableHandle.getAnalyzeHandle().isPresent() && !tableHandle.getAnalyzeHandle().get().isInitialAnalyze();
        if (nonInitialAnalyze && !file.isDataChange()) {
            return Stream.empty();
        }

        // skip files last modified at or before the requested instant
        boolean notModifiedSince = modifiedAfter.isPresent() && file.getModificationTime() <= modifiedAfter.get().toEpochMilli();
        if (notModifiedSince) {
            return Stream.empty();
        }

        // skip files exceeding the scan size limit
        boolean overSizeLimit = maxFileBytes.isPresent() && file.getSize() > maxFileBytes.get();
        if (overSizeLimit) {
            return Stream.empty();
        }

        // prune by the enforced partition constraint
        Map<DeltaLakeColumnHandle, Domain> partitionDomains = partitionConstraint.getDomains().orElseThrow();
        boolean partitionMatches = partitionMatchesPredicate(file.getCanonicalPartitionValues(), partitionDomains);
        if (!partitionMatches) {
            return Stream.empty();
        }

        // prune by the predicate derived from this file's entry
        TupleDomain<DeltaLakeColumnHandle> fileStatistics = createStatisticsPredicate(
                file,
                filteredColumns,
                tableHandle.getMetadataEntry().getCanonicalPartitionColumns());
        if (!dataConstraint.overlaps(fileStatistics)) {
            return Stream.empty();
        }

        // evaluate the constraint predicate on the partition values of this file
        if (constraint.predicate().isPresent()) {
            Map<String, Optional<String>> rawPartitionValues = file.getCanonicalPartitionValues();
            Map<ColumnHandle, NullableValue> typedPartitionValues = constraint.getPredicateColumns().orElseThrow().stream()
                    .filter(handle -> handle instanceof DeltaLakeColumnHandle)
                    .filter(handle -> rawPartitionValues.containsKey(((DeltaLakeColumnHandle) handle).getName()))
                    .collect(toImmutableMap(identity(), handle -> {
                        DeltaLakeColumnHandle partitionColumn = (DeltaLakeColumnHandle) handle;
                        Object value = deserializePartitionValue(partitionColumn, file.getCanonicalPartitionValues().get(partitionColumn.getName()));
                        return NullableValue.of(partitionColumn.getType(), value);
                    }));
            boolean accepted = constraint.predicate().get().test(typedPartitionValues);
            if (!accepted) {
                return Stream.empty();
            }
        }

        return splitsForFile(
                session,
                file,
                location,
                file.getCanonicalPartitionValues(),
                fileStatistics,
                fileSplittable,
                initialSplitBudget)
                .stream();
    });
}
Also used : ConnectorSplitManager(io.trino.spi.connector.ConnectorSplitManager) Constraint(io.trino.spi.connector.Constraint) DeltaLakeSessionProperties.getMaxInitialSplitSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getMaxInitialSplitSize) URLDecoder(java.net.URLDecoder) NullableValue(io.trino.spi.predicate.NullableValue) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) BiFunction(java.util.function.BiFunction) DeltaLakeSchemaSupport.extractSchema(io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractSchema) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) FixedSplitSource(io.trino.spi.connector.FixedSplitSource) Inject(javax.inject.Inject) DeltaLakeMetastore(io.trino.plugin.deltalake.metastore.DeltaLakeMetastore) DeltaLakeMetadata.createStatisticsPredicate(io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate) ImmutableList(com.google.common.collect.ImmutableList) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ConnectorTableHandle(io.trino.spi.connector.ConnectorTableHandle) Map(java.util.Map) Objects.requireNonNull(java.util.Objects.requireNonNull) ColumnHandle(io.trino.spi.connector.ColumnHandle) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ENGLISH(java.util.Locale.ENGLISH) ExecutorService(java.util.concurrent.ExecutorService) UTF_8(java.nio.charset.StandardCharsets.UTF_8) Domain(io.trino.spi.predicate.Domain) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) ConnectorSplitSource(io.trino.spi.connector.ConnectorSplitSource) ConnectorSession(io.trino.spi.connector.ConnectorSession) TupleDomain(io.trino.spi.predicate.TupleDomain) Instant(java.time.Instant) DeltaLakeSessionProperties.getMaxSplitSize(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getMaxSplitSize) DataSize(io.airlift.units.DataSize) List(java.util.List) 
ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) ClassLoaderSafeConnectorSplitSource(io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitSource) Stream(java.util.stream.Stream) DynamicFilter(io.trino.spi.connector.DynamicFilter) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) TypeManager(io.trino.spi.type.TypeManager) TransactionLogParser.deserializePartitionValue(io.trino.plugin.deltalake.transactionlog.TransactionLogParser.deserializePartitionValue) HiveTransactionHandle(io.trino.plugin.hive.HiveTransactionHandle) ConnectorTransactionHandle(io.trino.spi.connector.ConnectorTransactionHandle) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) DataSize(io.airlift.units.DataSize) ColumnHandle(io.trino.spi.connector.ColumnHandle) Optional(java.util.Optional) Instant(java.time.Instant) NullableValue(io.trino.spi.predicate.NullableValue) DeltaLakeMetastore(io.trino.plugin.deltalake.metastore.DeltaLakeMetastore) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) Domain(io.trino.spi.predicate.Domain) TupleDomain(io.trino.spi.predicate.TupleDomain)

Example 2 with DeltaLakeMetastore

use of io.trino.plugin.deltalake.metastore.DeltaLakeMetastore in project trino by trinodb.

In the class TestDeltaLakeMetadata, the method setUp:

/**
 * Prepares the shared test fixtures: creates a temporary catalog directory, bootstraps an
 * injector configured with a file-based Hive metastore rooted at that directory, obtains the
 * {@code DeltaLakeMetadataFactory} from it and creates the test database through the
 * {@code DeltaLakeMetastore}.
 *
 * @throws IOException declared because creating the temporary directory may fail
 */
@BeforeClass
public void setUp() throws IOException {
    temporaryCatalogDirectory = createTempDirectory("HiveCatalog").toFile();

    Map<String, String> properties = ImmutableMap.of(
            "hive.metastore", "file",
            "hive.metastore.catalog.dir", temporaryCatalogDirectory.getPath());

    Bootstrap bootstrap = new Bootstrap(
            // connector dependencies
            new JsonModule(),
            binder -> {
                ConnectorContext connectorContext = new TestingConnectorContext();
                NodeManager nodeManager = connectorContext.getNodeManager();
                binder.bind(NodeVersion.class).toInstance(new NodeVersion(nodeManager.getCurrentNode().getVersion()));
                binder.bind(CatalogName.class).toInstance(new CatalogName("test"));
                binder.bind(TypeManager.class).toInstance(connectorContext.getTypeManager());
                binder.bind(NodeManager.class).toInstance(connectorContext.getNodeManager());
                binder.bind(PageIndexerFactory.class).toInstance(connectorContext.getPageIndexerFactory());
            },
            // connector modules
            new DeltaLakeMetastoreModule(),
            new DeltaLakeModule(),
            // test setup
            binder -> binder.bind(HdfsEnvironment.class).toInstance(HDFS_ENVIRONMENT),
            new AbstractModule() {

                // Supplies the DeltaLakeMetastore binding, backed by the raw Hive metastore factory.
                @Provides
                public DeltaLakeMetastore getDeltaLakeMetastore(@RawHiveMetastoreFactory HiveMetastoreFactory hiveMetastoreFactory, TransactionLogAccess transactionLogAccess, TypeManager typeManager, CachingDeltaLakeStatisticsAccess statistics) {
                    return new HiveMetastoreBackedDeltaLakeMetastore(
                            hiveMetastoreFactory.createMetastore(Optional.empty()),
                            transactionLogAccess,
                            typeManager,
                            statistics);
                }
            });

    Injector testInjector = bootstrap
            .doNotInitializeLogging()
            .setRequiredConfigurationProperties(properties)
            .initialize();

    deltaLakeMetadataFactory = testInjector.getInstance(DeltaLakeMetadataFactory.class);

    // Look up the metastore before building the database definition, then register the test database.
    DeltaLakeMetastore deltaLakeMetastore = testInjector.getInstance(DeltaLakeMetastore.class);
    Database testDatabase = Database.builder()
            .setDatabaseName(DATABASE_NAME)
            .setOwnerName(Optional.of("test"))
            .setOwnerType(Optional.of(USER))
            .setLocation(Optional.empty())
            .build();
    deltaLakeMetastore.createDatabase(testDatabase);
}
Also used : HiveMetastoreFactory(io.trino.plugin.hive.metastore.HiveMetastoreFactory) RawHiveMetastoreFactory(io.trino.plugin.hive.metastore.RawHiveMetastoreFactory) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) TestingConnectorContext(io.trino.testing.TestingConnectorContext) Provides(com.google.inject.Provides) DeltaLakeMetastore(io.trino.plugin.deltalake.metastore.DeltaLakeMetastore) HiveMetastoreBackedDeltaLakeMetastore(io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore) JsonModule(io.airlift.json.JsonModule) PageIndexerFactory(io.trino.spi.PageIndexerFactory) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) AbstractModule(com.google.inject.AbstractModule) NodeVersion(io.trino.plugin.hive.NodeVersion) NodeManager(io.trino.spi.NodeManager) HiveMetastoreBackedDeltaLakeMetastore(io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore) Injector(com.google.inject.Injector) ConnectorContext(io.trino.spi.connector.ConnectorContext) TestingConnectorContext(io.trino.testing.TestingConnectorContext) DeltaLakeMetastoreModule(io.trino.plugin.deltalake.metastore.DeltaLakeMetastoreModule) Bootstrap(io.airlift.bootstrap.Bootstrap) TypeManager(io.trino.spi.type.TypeManager) CatalogName(io.trino.plugin.base.CatalogName) CachingDeltaLakeStatisticsAccess(io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess) BeforeClass(org.testng.annotations.BeforeClass)

Aggregations

DeltaLakeMetastore (io.trino.plugin.deltalake.metastore.DeltaLakeMetastore)2 TypeManager (io.trino.spi.type.TypeManager)2 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1 ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap)1 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)1 AbstractModule (com.google.inject.AbstractModule)1 Injector (com.google.inject.Injector)1 Provides (com.google.inject.Provides)1 Bootstrap (io.airlift.bootstrap.Bootstrap)1 JsonModule (io.airlift.json.JsonModule)1 DataSize (io.airlift.units.DataSize)1 CatalogName (io.trino.plugin.base.CatalogName)1 ClassLoaderSafeConnectorSplitSource (io.trino.plugin.base.classloader.ClassLoaderSafeConnectorSplitSource)1 DeltaLakeMetadata.createStatisticsPredicate (io.trino.plugin.deltalake.DeltaLakeMetadata.createStatisticsPredicate)1 DeltaLakeSessionProperties.getMaxInitialSplitSize (io.trino.plugin.deltalake.DeltaLakeSessionProperties.getMaxInitialSplitSize)1 DeltaLakeSessionProperties.getMaxSplitSize (io.trino.plugin.deltalake.DeltaLakeSessionProperties.getMaxSplitSize)1 DeltaLakeMetastoreModule (io.trino.plugin.deltalake.metastore.DeltaLakeMetastoreModule)1 HiveMetastoreBackedDeltaLakeMetastore (io.trino.plugin.deltalake.metastore.HiveMetastoreBackedDeltaLakeMetastore)1 CachingDeltaLakeStatisticsAccess (io.trino.plugin.deltalake.statistics.CachingDeltaLakeStatisticsAccess)1