Search in sources:

Example 1 with TRANSACTION_LOG_DIRECTORY

Use of io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY in project trino by trinodb.
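
For context: the constant names the _delta_log subdirectory that Delta Lake maintains under every table location, and TransactionLogUtil also exposes a helper that resolves it against a table path (the helper appears in the imports of Example 2). A minimal sketch of the declarations; the helper body shown here is an assumption for illustration:

public static final String TRANSACTION_LOG_DIRECTORY = "_delta_log";

// Assumed implementation: resolve the transaction log directory under a table location
public static Path getTransactionLogDir(Path tableLocation)
{
    return new Path(tableLocation, TRANSACTION_LOG_DIRECTORY);
}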

From class TestTransactionLogAccess, method testSnapshotsAreConsistent:

@Test
public void testSnapshotsAreConsistent() throws Exception {
    String tableName = "person";
    File tempDir = Files.createTempDir();
    File tableDir = new File(tempDir, tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    transactionLogDir.mkdirs();
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());
    // Seed the table with transaction log entries 0 through 11 plus the _last_checkpoint marker
    copyTransactionLogEntry(0, 12, resourceDir, transactionLogDir);
    Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME));
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    List<AddFileEntry> expectedDataFiles = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);
    // Append commits 12 and 13 after the snapshot has already been loaded
    copyTransactionLogEntry(12, 14, resourceDir, transactionLogDir);
    // Data files added by the new commits; they should be visible only through a reloaded snapshot
    Set<String> newDataFiles = ImmutableSet.of("age=28/part-00000-40dd1707-1d42-4328-a59a-21f5c945fe60.c000.snappy.parquet", "age=29/part-00000-3794c463-cb0c-4beb-8d07-7cc1e3b5920f.c000.snappy.parquet");
    TableSnapshot updatedTableSnapshot = transactionLogAccess.loadSnapshot(new SchemaTableName("schema", tableName), new Path(tableDir.toURI()), SESSION);
    List<AddFileEntry> allDataFiles = transactionLogAccess.getActiveFiles(updatedTableSnapshot, SESSION);
    // The original snapshot must keep resolving to its fixed version
    List<AddFileEntry> dataFilesWithFixedVersion = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);
    for (String newFilePath : newDataFiles) {
        assertTrue(allDataFiles.stream().anyMatch(entry -> entry.getPath().equals(newFilePath)));
        assertTrue(dataFilesWithFixedVersion.stream().noneMatch(entry -> entry.getPath().equals(newFilePath)));
    }
    assertEquals(expectedDataFiles.size(), dataFilesWithFixedVersion.size());
    List<ColumnMetadata> columns = extractSchema(transactionLogAccess.getMetadataEntry(tableSnapshot, SESSION).get(), TESTING_TYPE_MANAGER);
    for (int i = 0; i < expectedDataFiles.size(); i++) {
        AddFileEntry expected = expectedDataFiles.get(i);
        AddFileEntry actual = dataFilesWithFixedVersion.get(i);
        assertEquals(expected.getPath(), actual.getPath());
        assertEquals(expected.getPartitionValues(), actual.getPartitionValues());
        assertEquals(expected.getSize(), actual.getSize());
        assertEquals(expected.getModificationTime(), actual.getModificationTime());
        assertEquals(expected.isDataChange(), actual.isDataChange());
        assertEquals(expected.getTags(), actual.getTags());
        assertTrue(expected.getStats().isPresent());
        assertTrue(actual.getStats().isPresent());
        for (ColumnMetadata column : columns) {
            DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR);
            assertEquals(expected.getStats().get().getMinColumnValue(columnHandle), actual.getStats().get().getMinColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getMaxColumnValue(columnHandle), actual.getStats().get().getMaxColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getNullCount(columnHandle.getName()), actual.getStats().get().getNullCount(columnHandle.getName()));
            assertEquals(expected.getStats().get().getNumRecords(), actual.getStats().get().getNumRecords());
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path), Assertions.assertThat(org.assertj.core.api.Assertions.assertThat), TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot), Test(org.testng.annotations.Test), DeltaLakeSchemaSupport.extractSchema(io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractSchema), AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry), RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry), NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication), BigDecimal(java.math.BigDecimal), CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager), Map(java.util.Map), Sets.union(com.google.common.collect.Sets.union), HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration), Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice), LAST_CHECKPOINT_FILENAME(io.trino.plugin.deltalake.transactionlog.TransactionLogParser.LAST_CHECKPOINT_FILENAME), Assert.assertFalse(org.testng.Assert.assertFalse), Assert.assertEquals(io.trino.testing.assertions.Assert.assertEquals), TRANSACTION_LOG_DIRECTORY(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY), ImmutableSet(com.google.common.collect.ImmutableSet), FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats), HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment), ImmutableMap(com.google.common.collect.ImmutableMap), UTC_KEY(io.trino.spi.type.TimeZoneKey.UTC_KEY), SESSION(io.trino.testing.TestingConnectorSession.SESSION), MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry), Set(java.util.Set), Collectors(java.util.stream.Collectors), SchemaTableName(io.trino.spi.connector.SchemaTableName), String.format(java.lang.String.format), List(java.util.List), Stream(java.util.stream.Stream), HdfsConfig(io.trino.plugin.hive.HdfsConfig), LocalDate(java.time.LocalDate), HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer), Decimals(io.trino.spi.type.Decimals), UTC(java.time.ZoneOffset.UTC), Optional(java.util.Optional), ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig), REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR), DeltaLakeFileStatistics(io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics), DataProvider(org.testng.annotations.DataProvider), ColumnMetadata(io.trino.spi.connector.ColumnMetadata), LocalDateTime(java.time.LocalDateTime), DateTimeEncoding(io.trino.spi.type.DateTimeEncoding), ImmutableList(com.google.common.collect.ImmutableList), Files(com.google.common.io.Files), IntegerType(io.trino.spi.type.IntegerType), TESTING_TYPE_MANAGER(io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER), IOException(java.io.IOException), HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration), TupleDomain(io.trino.spi.predicate.TupleDomain), TestingConnectorContext(io.trino.testing.TestingConnectorContext), File(java.io.File), Assertions.assertEqualsIgnoreOrder(io.airlift.testing.Assertions.assertEqualsIgnoreOrder), ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry), Assert.assertTrue(org.testng.Assert.assertTrue), CommitInfoEntry(io.trino.plugin.deltalake.transactionlog.CommitInfoEntry), TypeManager(io.trino.spi.type.TypeManager)
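
The copyTransactionLogEntry helper called above is not shown on this page. A minimal reconstruction, assuming an exclusive end version and Delta Lake's zero-padded 20-digit commit file names (e.g. 00000000000000000011.json); the signature and body are assumptions for illustration, using the Guava Files and String.format imports listed above:

private void copyTransactionLogEntry(int startVersion, int endVersion, File sourceDir, File targetDir)
        throws IOException
{
    // Copy commit files for versions in [startVersion, endVersion)
    for (int version = startVersion; version < endVersion; version++) {
        String entryName = format("%020d.json", version);
        Files.copy(new File(sourceDir, entryName), new File(targetDir, entryName));
    }
}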

Example 2 with TRANSACTION_LOG_DIRECTORY

Use of io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY in project trino by trinodb.

From class VacuumProcedure, method doVacuum:

private void doVacuum(ConnectorSession session, String schema, String table, String retention) throws IOException {
    checkProcedureArgument(schema != null, "schema_name cannot be null");
    checkProcedureArgument(!schema.isEmpty(), "schema_name cannot be empty");
    checkProcedureArgument(table != null, "table_name cannot be null");
    checkProcedureArgument(!table.isEmpty(), "table_name cannot be empty");
    checkProcedureArgument(retention != null, "retention cannot be null");
    Duration retentionDuration = Duration.valueOf(retention);
    Duration minRetention = getVacuumMinRetention(session);
    checkProcedureArgument(retentionDuration.compareTo(minRetention) >= 0, "Retention specified (%s) is shorter than the minimum retention configured in the system (%s). " + "Minimum retention can be changed with %s configuration property or %s.%s session property", retentionDuration, minRetention, DeltaLakeConfig.VACUUM_MIN_RETENTION, catalogName, DeltaLakeSessionProperties.VACUUM_MIN_RETENTION);
    Instant threshold = Instant.now().minusMillis(retentionDuration.toMillis());
    DeltaLakeMetadata metadata = metadataFactory.create(session.getIdentity());
    SchemaTableName tableName = new SchemaTableName(schema, table);
    DeltaLakeTableHandle handle = metadata.getTableHandle(session, tableName);
    checkProcedureArgument(handle != null, "Table '%s' does not exist", tableName);
    TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(tableName, new Path(handle.getLocation()), session);
    Path tableLocation = tableSnapshot.getTableLocation();
    Path transactionLogDir = getTransactionLogDir(tableLocation);
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(session), tableLocation);
    String commonPathPrefix = tableLocation + "/";
    String queryId = session.getQueryId();
    // Retain all active files and every file removed by a "recent" transaction (except for the oldest "recent" one).
    // Any remaining files are not live and are not needed to read any "recent" snapshot.
    List<Long> recentVersions = transactionLogAccess.getPastTableVersions(fileSystem, transactionLogDir, threshold, tableSnapshot.getVersion());
    Set<String> retainedPaths = Stream.concat(
            transactionLogAccess.getActiveFiles(tableSnapshot, session).stream()
                    .map(AddFileEntry::getPath),
            transactionLogAccess.getJsonEntries(
                    fileSystem,
                    transactionLogDir,
                    // files removed by "recent" transactions are no longer active, but are still
                    // needed to read a "recent" snapshot; the oldest "recent" version is skipped
                    recentVersions.stream().sorted(naturalOrder()).skip(1).collect(toImmutableList()))
                    .map(DeltaLakeTransactionLogEntry::getRemove)
                    .filter(Objects::nonNull)
                    .map(RemoveFileEntry::getPath))
            .peek(path -> checkState(!path.startsWith(tableLocation.toString()), "Unexpected absolute path in transaction log: %s", path))
            .collect(toImmutableSet());
    log.debug("[%s] attempting to vacuum table %s [%s] with %s retention (expiry threshold %s). %s data file paths marked for retention", queryId, tableName, tableLocation, retention, threshold, retainedPaths.size());
    long nonFiles = 0;
    long allPathsChecked = 0;
    long transactionLogFiles = 0;
    long retainedKnownFiles = 0;
    long retainedUnknownFiles = 0;
    long removedFiles = 0;
    RemoteIterator<LocatedFileStatus> listing = fileSystem.listFiles(tableLocation, true);
    while (listing.hasNext()) {
        LocatedFileStatus fileStatus = listing.next();
        Path path = fileStatus.getPath();
        checkState(path.toString().startsWith(commonPathPrefix), "Unexpected path [%s] returned when listing files under [%s]", path, tableLocation);
        String relativePath = path.toString().substring(commonPathPrefix.length());
        if (relativePath.isEmpty()) {
            // A file may be returned for "tableLocation/" itself; this is possible on S3.
            continue;
        }
        allPathsChecked++;
        // TODO Note: Databricks can delete directories during vacuum on s3. This might need to be revisited.
        if (!fileStatus.isFile()) {
            nonFiles++;
            continue;
        }
        // ignore tableLocation/_delta_log/**
        if (relativePath.equals(TRANSACTION_LOG_DIRECTORY) || relativePath.startsWith(TRANSACTION_LOG_DIRECTORY + "/")) {
            log.debug("[%s] skipping a file inside transaction log dir: %s", queryId, path);
            transactionLogFiles++;
            continue;
        }
        // skip retained files
        if (retainedPaths.contains(relativePath)) {
            log.debug("[%s] retaining a known file: %s", queryId, path);
            retainedKnownFiles++;
            continue;
        }
        // ignore recently created files
        long modificationTime = fileStatus.getModificationTime();
        Instant modificationInstant = Instant.ofEpochMilli(modificationTime);
        if (!modificationInstant.isBefore(threshold)) {
            log.debug("[%s] retaining an unknown file %s with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
            retainedUnknownFiles++;
            continue;
        }
        log.debug("[%s] deleting file [%s] with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
        if (!fileSystem.delete(path, false)) {
            throw new TrinoException(GENERIC_INTERNAL_ERROR, "Failed to delete file: " + path);
        }
        removedFiles++;
    }
    log.info("[%s] finished vacuuming table %s [%s]: files checked: %s; non-files: %s; metadata files: %s; retained known files: %s; retained unknown files: %s; removed files: %s", queryId, tableName, tableLocation, allPathsChecked, nonFiles, transactionLogFiles, retainedKnownFiles, retainedUnknownFiles, removedFiles);
}
Also used: Path(org.apache.hadoop.fs.Path), TransactionLogUtil.getTransactionLogDir(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir), MethodHandle(java.lang.invoke.MethodHandle), Provider(javax.inject.Provider), Comparator.naturalOrder(java.util.Comparator.naturalOrder), Logger(io.airlift.log.Logger), FileSystem(org.apache.hadoop.fs.FileSystem), DeltaLakeMetadataFactory(io.trino.plugin.deltalake.DeltaLakeMetadataFactory), TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot), MethodHandleUtil.methodHandle(io.trino.spi.block.MethodHandleUtil.methodHandle), DeltaLakeSessionProperties.getVacuumMinRetention(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getVacuumMinRetention), Duration(io.airlift.units.Duration), AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry), RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry), DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry), TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess), Inject(javax.inject.Inject), VARCHAR(io.trino.spi.type.VarcharType.VARCHAR), ImmutableList(com.google.common.collect.ImmutableList), Procedure(io.trino.spi.procedure.Procedure), DeltaLakeMetadata(io.trino.plugin.deltalake.DeltaLakeMetadata), Objects.requireNonNull(java.util.Objects.requireNonNull), ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet), DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle), Procedures.checkProcedureArgument(io.trino.plugin.deltalake.procedure.Procedures.checkProcedureArgument), Argument(io.trino.spi.procedure.Procedure.Argument), TRANSACTION_LOG_DIRECTORY(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY), HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment), ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList), LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus), Set(java.util.Set), TrinoException(io.trino.spi.TrinoException), IOException(java.io.IOException), ConnectorSession(io.trino.spi.connector.ConnectorSession), Instant(java.time.Instant), CatalogName(io.trino.plugin.base.CatalogName), ThreadContextClassLoader(io.trino.spi.classloader.ThreadContextClassLoader), SchemaTableName(io.trino.spi.connector.SchemaTableName), GENERIC_INTERNAL_ERROR(io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR), String.format(java.lang.String.format), Preconditions.checkState(com.google.common.base.Preconditions.checkState), Objects(java.util.Objects), List(java.util.List), Stream(java.util.stream.Stream), DeltaLakeConfig(io.trino.plugin.deltalake.DeltaLakeConfig), DeltaLakeSessionProperties(io.trino.plugin.deltalake.DeltaLakeSessionProperties), RemoteIterator(org.apache.hadoop.fs.RemoteIterator)
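
As a usage note, the retention argument is parsed with airlift's Duration and compared against the configured minimum before any file is deleted. A minimal sketch of that check in isolation; the "30d" and "7d" values are illustrative assumptions, not stated Trino defaults:

// Assumes io.airlift.units.Duration semantics: a number plus a unit such as "s", "m", "h", "d"
Duration retentionDuration = Duration.valueOf("30d");
Duration minRetention = Duration.valueOf("7d");       // normally read from config or session property
checkProcedureArgument(
        retentionDuration.compareTo(minRetention) >= 0,
        "Retention specified (%s) is shorter than the minimum retention configured in the system (%s)",
        retentionDuration, minRetention);

In SQL the procedure is invoked along the lines of CALL <catalog>.system.vacuum('<schema>', '<table>', '7d').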

Aggregations

Used in both examples (2): ImmutableList (com.google.common.collect.ImmutableList), AddFileEntry (io.trino.plugin.deltalake.transactionlog.AddFileEntry), RemoveFileEntry (io.trino.plugin.deltalake.transactionlog.RemoveFileEntry), TableSnapshot (io.trino.plugin.deltalake.transactionlog.TableSnapshot), TRANSACTION_LOG_DIRECTORY (io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY), HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment), SchemaTableName (io.trino.spi.connector.SchemaTableName), IOException (java.io.IOException), String.format (java.lang.String.format), List (java.util.List), Set (java.util.Set), Stream (java.util.stream.Stream), Path (org.apache.hadoop.fs.Path)

Used in one example (1): Preconditions.checkState (com.google.common.base.Preconditions.checkState), ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList), ImmutableMap (com.google.common.collect.ImmutableMap), ImmutableSet (com.google.common.collect.ImmutableSet), ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet), Sets.union (com.google.common.collect.Sets.union), Files (com.google.common.io.Files)