
Example 1 with RemoveFileEntry

Use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.

From class TestCheckpointEntryIterator, method testReadAllEntries:

@Test
public void testReadAllEntries() throws Exception {
    URI checkpointUri = getResource(TEST_CHECKPOINT).toURI();
    MetadataEntry metadataEntry = readMetadataEntry(checkpointUri);
    CheckpointEntryIterator checkpointEntryIterator = createCheckpointEntryIterator(
            checkpointUri,
            ImmutableSet.of(METADATA, PROTOCOL, TRANSACTION, ADD, REMOVE, COMMIT),
            Optional.of(metadataEntry));
    List<DeltaLakeTransactionLogEntry> entries = ImmutableList.copyOf(checkpointEntryIterator);
    assertThat(entries).hasSize(17);
    // MetadataEntry
    assertThat(entries).element(12).extracting(DeltaLakeTransactionLogEntry::getMetaData).isEqualTo(metadataEntry);
    // ProtocolEntry
    assertThat(entries).element(11).extracting(DeltaLakeTransactionLogEntry::getProtocol).isEqualTo(new ProtocolEntry(1, 2));
    // TransactionEntry
    // not found in the checkpoint, TODO add a test
    assertThat(entries).map(DeltaLakeTransactionLogEntry::getTxn).filteredOn(Objects::nonNull).isEmpty();
    // AddFileEntry
    assertThat(entries).element(8).extracting(DeltaLakeTransactionLogEntry::getAdd).isEqualTo(new AddFileEntry(
            "age=42/part-00003-0f53cae3-3e34-4876-b651-e1db9584dbc3.c000.snappy.parquet",
            Map.of("age", "42"),
            2634,
            1579190165000L,
            false,
            Optional.of("{" +
                    "\"numRecords\":1," +
                    "\"minValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0}," +
                    "\"maxValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0}," +
                    "\"nullCount\":{\"name\":0,\"married\":0,\"phones\":0,\"address\":{\"street\":0,\"city\":0,\"state\":0,\"zip\":0},\"income\":0}" +
                    "}"),
            Optional.empty(),
            null));
    // RemoveFileEntry
    assertThat(entries).element(3).extracting(DeltaLakeTransactionLogEntry::getRemove).isEqualTo(new RemoveFileEntry("age=42/part-00000-951068bd-bcf4-4094-bb94-536f3c41d31f.c000.snappy.parquet", 1579190155406L, false));
    // CommitInfoEntry
    // not found in the checkpoint, TODO add a test
    assertThat(entries).map(DeltaLakeTransactionLogEntry::getCommitInfo).filteredOn(Objects::nonNull).isEmpty();
}
Also used : ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) URI(java.net.URI) Test(org.testng.annotations.Test)
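
The same helpers can restrict the iterator to a single entry type. A minimal sketch along the lines of the test above, reusing its createCheckpointEntryIterator and readMetadataEntry helpers (the assertion simply re-checks the remove entry already verified in testReadAllEntries):

@Test
public void testReadRemoveEntriesOnly() throws Exception {
    URI checkpointUri = getResource(TEST_CHECKPOINT).toURI();
    MetadataEntry metadataEntry = readMetadataEntry(checkpointUri);
    // Request only REMOVE entries; the iterator skips the other entry types
    CheckpointEntryIterator iterator = createCheckpointEntryIterator(checkpointUri, ImmutableSet.of(REMOVE), Optional.of(metadataEntry));
    List<RemoveFileEntry> removed = ImmutableList.copyOf(iterator).stream()
            .map(DeltaLakeTransactionLogEntry::getRemove)
            .filter(Objects::nonNull)
            .collect(ImmutableList.toImmutableList());
    assertThat(removed).contains(new RemoveFileEntry("age=42/part-00000-951068bd-bcf4-4094-bb94-536f3c41d31f.c000.snappy.parquet", 1579190155406L, false));
}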

Example 2 with RemoveFileEntry

Use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.

From class TestCheckpointBuilder, method testCheckpointBuilder:

@Test
public void testCheckpointBuilder() {
    CheckpointBuilder builder = new CheckpointBuilder();
    MetadataEntry metadata1 = new MetadataEntry("1", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    MetadataEntry metadata2 = new MetadataEntry("2", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    builder.addLogEntry(metadataEntry(metadata1));
    builder.addLogEntry(metadataEntry(metadata2));
    ProtocolEntry protocol1 = new ProtocolEntry(1, 2);
    ProtocolEntry protocol2 = new ProtocolEntry(3, 4);
    builder.addLogEntry(protocolEntry(protocol1));
    builder.addLogEntry(protocolEntry(protocol2));
    TransactionEntry app1TransactionV1 = new TransactionEntry("app1", 1, 1);
    TransactionEntry app1TransactionV2 = new TransactionEntry("app1", 2, 2);
    TransactionEntry app1TransactionV3 = new TransactionEntry("app1", 3, 3);
    TransactionEntry app2TransactionV5 = new TransactionEntry("app2", 5, 5);
    builder.addLogEntry(transactionEntry(app1TransactionV2));
    builder.addLogEntry(transactionEntry(app1TransactionV3));
    builder.addLogEntry(transactionEntry(app1TransactionV1));
    builder.addLogEntry(transactionEntry(app2TransactionV5));
    AddFileEntry addA1 = new AddFileEntry("a", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeA1 = new RemoveFileEntry("a", 1, true);
    AddFileEntry addA2 = new AddFileEntry("a", Map.of(), 2, 1, true, Optional.empty(), Optional.empty(), Map.of());
    AddFileEntry addB = new AddFileEntry("b", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeB = new RemoveFileEntry("b", 1, true);
    RemoveFileEntry removeC = new RemoveFileEntry("c", 1, true);
    builder.addLogEntry(addFileEntry(addA1));
    builder.addLogEntry(removeFileEntry(removeA1));
    builder.addLogEntry(addFileEntry(addA2));
    builder.addLogEntry(addFileEntry(addB));
    builder.addLogEntry(removeFileEntry(removeB));
    builder.addLogEntry(removeFileEntry(removeC));
    CheckpointEntries expectedCheckpoint = new CheckpointEntries(metadata2, protocol2, Set.of(app1TransactionV3, app2TransactionV5), Set.of(addA2), Set.of(removeB, removeC));
    assertEquals(expectedCheckpoint, builder.build());
}
Also used : ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) TransactionEntry(io.trino.plugin.deltalake.transactionlog.TransactionEntry) Test(org.testng.annotations.Test)
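
The assertions above pin down the builder's reconciliation rules: the last metadata and protocol entries win, the highest-versioned transaction per application id wins, and an add for a path supersedes an earlier remove of that path (and vice versa). A minimal sketch of the add/remove bookkeeping those rules imply, with hypothetical field and method names rather than the actual CheckpointBuilder internals:

private final Map<String, AddFileEntry> addedFiles = new HashMap<>();
private final Map<String, RemoveFileEntry> removedFiles = new HashMap<>();

void handleAdd(AddFileEntry add) {
    // the latest add for a path wins and cancels any pending remove of the same path
    removedFiles.remove(add.getPath());
    addedFiles.put(add.getPath(), add);
}

void handleRemove(RemoveFileEntry remove) {
    // a remove cancels a pending add; the remove itself is kept so vacuum can expire the file
    addedFiles.remove(remove.getPath());
    removedFiles.put(remove.getPath(), remove);
}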

Example 3 with RemoveFileEntry

Use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.

From class CheckpointWriter, method write:

public void write(ConnectorSession session, CheckpointEntries entries, Path targetPath) {
    RowType metadataEntryType = checkpointSchemaManager.getMetadataEntryType();
    RowType protocolEntryType = checkpointSchemaManager.getProtocolEntryType();
    RowType txnEntryType = checkpointSchemaManager.getTxnEntryType();
    RowType addEntryType = checkpointSchemaManager.getAddEntryType(entries.getMetadataEntry());
    RowType removeEntryType = checkpointSchemaManager.getRemoveEntryType();
    List<String> columnNames = ImmutableList.of("metaData", "protocol", "txn", "add", "remove");
    List<Type> columnTypes = ImmutableList.of(metadataEntryType, protocolEntryType, txnEntryType, addEntryType, removeEntryType);
    Properties schema = buildSchemaProperties(columnNames, columnTypes);
    Configuration conf = hdfsEnvironment.getConfiguration(new HdfsEnvironment.HdfsContext(session), targetPath);
    configureCompression(conf, SNAPPY);
    JobConf jobConf = toJobConf(conf);
    RecordFileWriter writer = new RecordFileWriter(targetPath, columnNames, fromHiveStorageFormat(PARQUET), schema, PARQUET.getEstimatedWriterMemoryUsage(), jobConf, typeManager, DateTimeZone.UTC, session);
    PageBuilder pageBuilder = new PageBuilder(columnTypes);
    writeMetadataEntry(pageBuilder, metadataEntryType, entries.getMetadataEntry());
    writeProtocolEntry(pageBuilder, protocolEntryType, entries.getProtocolEntry());
    for (TransactionEntry transactionEntry : entries.getTransactionEntries()) {
        writeTransactionEntry(pageBuilder, txnEntryType, transactionEntry);
    }
    for (AddFileEntry addFileEntry : entries.getAddFileEntries()) {
        writeAddFileEntry(pageBuilder, addEntryType, addFileEntry);
    }
    for (RemoveFileEntry removeFileEntry : entries.getRemoveFileEntries()) {
        writeRemoveFileEntry(pageBuilder, removeEntryType, removeFileEntry);
    }
    // Not writing commit infos for now. Databricks does not keep them in the checkpoints by default
    writer.appendRows(pageBuilder.build());
    writer.commit();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) RowType(io.trino.spi.type.RowType) PageBuilder(io.trino.spi.PageBuilder) Properties(java.util.Properties) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) Type(io.trino.spi.type.Type) TimestampType(io.trino.spi.type.TimestampType) HiveType(io.trino.plugin.hive.HiveType) RowType(io.trino.spi.type.RowType) MapType(io.trino.spi.type.MapType) ArrayType(io.trino.spi.type.ArrayType) RecordFileWriter(io.trino.plugin.hive.RecordFileWriter) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) ConfigurationUtils.toJobConf(io.trino.plugin.hive.util.ConfigurationUtils.toJobConf) JobConf(org.apache.hadoop.mapred.JobConf) TransactionEntry(io.trino.plugin.deltalake.transactionlog.TransactionEntry)
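
A caller assembles a CheckpointEntries value and hands it to write. A hedged sketch of the invocation, assuming a configured CheckpointWriter and ConnectorSession are available; the checkpoint file name here is hypothetical (Delta checkpoints live under _delta_log/ and are named after the zero-padded table version):

CheckpointEntries entries = new CheckpointEntries(
        metadataEntry,        // MetadataEntry
        protocolEntry,        // ProtocolEntry
        transactionEntries,   // Set<TransactionEntry>
        addFileEntries,       // Set<AddFileEntry>
        removeFileEntries);   // Set<RemoveFileEntry>
// Hypothetical target path for checkpoint version 10
Path targetPath = new Path(tableLocation, "_delta_log/00000000000000000010.checkpoint.parquet");
checkpointWriter.write(session, entries, targetPath);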

Example 4 with RemoveFileEntry

Use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.

From class VacuumProcedure, method doVacuum:

private void doVacuum(ConnectorSession session, String schema, String table, String retention) throws IOException {
    checkProcedureArgument(schema != null, "schema_name cannot be null");
    checkProcedureArgument(!schema.isEmpty(), "schema_name cannot be empty");
    checkProcedureArgument(table != null, "table_name cannot be null");
    checkProcedureArgument(!table.isEmpty(), "table_name cannot be empty");
    checkProcedureArgument(retention != null, "retention cannot be null");
    Duration retentionDuration = Duration.valueOf(retention);
    Duration minRetention = getVacuumMinRetention(session);
    checkProcedureArgument(
            retentionDuration.compareTo(minRetention) >= 0,
            "Retention specified (%s) is shorter than the minimum retention configured in the system (%s). " +
                    "Minimum retention can be changed with %s configuration property or %s.%s session property",
            retentionDuration,
            minRetention,
            DeltaLakeConfig.VACUUM_MIN_RETENTION,
            catalogName,
            DeltaLakeSessionProperties.VACUUM_MIN_RETENTION);
    Instant threshold = Instant.now().minusMillis(retentionDuration.toMillis());
    DeltaLakeMetadata metadata = metadataFactory.create(session.getIdentity());
    SchemaTableName tableName = new SchemaTableName(schema, table);
    DeltaLakeTableHandle handle = metadata.getTableHandle(session, tableName);
    checkProcedureArgument(handle != null, "Table '%s' does not exist", tableName);
    TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(tableName, new Path(handle.getLocation()), session);
    Path tableLocation = tableSnapshot.getTableLocation();
    Path transactionLogDir = getTransactionLogDir(tableLocation);
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(session), tableLocation);
    String commonPathPrefix = tableLocation + "/";
    String queryId = session.getQueryId();
    // Retain all active files and every file removed by a "recent" transaction (except for the oldest "recent").
    // Any remaining files are not live, and are not needed to read any "recent" snapshot.
    List<Long> recentVersions = transactionLogAccess.getPastTableVersions(fileSystem, transactionLogDir, threshold, tableSnapshot.getVersion());
    Set<String> retainedPaths = Stream.concat(
            transactionLogAccess.getActiveFiles(tableSnapshot, session).stream()
                    .map(AddFileEntry::getPath),
            transactionLogAccess.getJsonEntries(
                    fileSystem,
                    transactionLogDir,
                    // skip the oldest "recent" version; files removed by the later versions are no longer
                    // active files, but still needed to read a "recent" snapshot
                    recentVersions.stream().sorted(naturalOrder()).skip(1).collect(toImmutableList()))
                    .map(DeltaLakeTransactionLogEntry::getRemove)
                    .filter(Objects::nonNull)
                    .map(RemoveFileEntry::getPath))
            .peek(path -> checkState(!path.startsWith(tableLocation.toString()), "Unexpected absolute path in transaction log: %s", path))
            .collect(toImmutableSet());
    log.debug("[%s] attempting to vacuum table %s [%s] with %s retention (expiry threshold %s). %s data file paths marked for retention", queryId, tableName, tableLocation, retention, threshold, retainedPaths.size());
    long nonFiles = 0;
    long allPathsChecked = 0;
    long transactionLogFiles = 0;
    long retainedKnownFiles = 0;
    long retainedUnknownFiles = 0;
    long removedFiles = 0;
    RemoteIterator<LocatedFileStatus> listing = fileSystem.listFiles(tableLocation, true);
    while (listing.hasNext()) {
        LocatedFileStatus fileStatus = listing.next();
        Path path = fileStatus.getPath();
        checkState(path.toString().startsWith(commonPathPrefix), "Unexpected path [%s] returned when listing files under [%s]", path, tableLocation);
        String relativePath = path.toString().substring(commonPathPrefix.length());
        if (relativePath.isEmpty()) {
            // A file can be returned for "tableLocation/" itself; this is possible on S3.
            continue;
        }
        allPathsChecked++;
        // TODO Note: Databricks can delete directories during vacuum on s3. This might need to be revisited.
        if (!fileStatus.isFile()) {
            nonFiles++;
            continue;
        }
        // ignore tableLocation/_delta_log/**
        if (relativePath.equals(TRANSACTION_LOG_DIRECTORY) || relativePath.startsWith(TRANSACTION_LOG_DIRECTORY + "/")) {
            log.debug("[%s] skipping a file inside transaction log dir: %s", queryId, path);
            transactionLogFiles++;
            continue;
        }
        // skip retained files
        if (retainedPaths.contains(relativePath)) {
            log.debug("[%s] retaining a known file: %s", queryId, path);
            retainedKnownFiles++;
            continue;
        }
        // ignore recently created files
        long modificationTime = fileStatus.getModificationTime();
        Instant modificationInstant = Instant.ofEpochMilli(modificationTime);
        if (!modificationInstant.isBefore(threshold)) {
            log.debug("[%s] retaining an unknown file %s with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
            retainedUnknownFiles++;
            continue;
        }
        log.debug("[%s] deleting file [%s] with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
        if (!fileSystem.delete(path, false)) {
            throw new TrinoException(GENERIC_INTERNAL_ERROR, "Failed to delete file: " + path);
        }
        removedFiles++;
    }
    log.info("[%s] finished vacuuming table %s [%s]: files checked: %s; non-files: %s; metadata files: %s; retained known files: %s; retained unknown files: %s; removed files: %s", queryId, tableName, tableLocation, allPathsChecked, nonFiles, transactionLogFiles, retainedKnownFiles, retainedUnknownFiles, removedFiles);
}
Also used : Path(org.apache.hadoop.fs.Path) TransactionLogUtil.getTransactionLogDir(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.getTransactionLogDir) MethodHandle(java.lang.invoke.MethodHandle) Provider(javax.inject.Provider) Comparator.naturalOrder(java.util.Comparator.naturalOrder) Logger(io.airlift.log.Logger) FileSystem(org.apache.hadoop.fs.FileSystem) DeltaLakeMetadataFactory(io.trino.plugin.deltalake.DeltaLakeMetadataFactory) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) MethodHandleUtil.methodHandle(io.trino.spi.block.MethodHandleUtil.methodHandle) DeltaLakeSessionProperties.getVacuumMinRetention(io.trino.plugin.deltalake.DeltaLakeSessionProperties.getVacuumMinRetention) Duration(io.airlift.units.Duration) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) TransactionLogAccess(io.trino.plugin.deltalake.transactionlog.TransactionLogAccess) Inject(javax.inject.Inject) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) ImmutableList(com.google.common.collect.ImmutableList) Procedure(io.trino.spi.procedure.Procedure) DeltaLakeMetadata(io.trino.plugin.deltalake.DeltaLakeMetadata) Objects.requireNonNull(java.util.Objects.requireNonNull) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) DeltaLakeTableHandle(io.trino.plugin.deltalake.DeltaLakeTableHandle) Procedures.checkProcedureArgument(io.trino.plugin.deltalake.procedure.Procedures.checkProcedureArgument) Argument(io.trino.spi.procedure.Procedure.Argument) TRANSACTION_LOG_DIRECTORY(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) IOException(java.io.IOException) ConnectorSession(io.trino.spi.connector.ConnectorSession) Instant(java.time.Instant) CatalogName(io.trino.plugin.base.CatalogName) ThreadContextClassLoader(io.trino.spi.classloader.ThreadContextClassLoader) SchemaTableName(io.trino.spi.connector.SchemaTableName) GENERIC_INTERNAL_ERROR(io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) Objects(java.util.Objects) List(java.util.List) Stream(java.util.stream.Stream) DeltaLakeConfig(io.trino.plugin.deltalake.DeltaLakeConfig) DeltaLakeSessionProperties(io.trino.plugin.deltalake.DeltaLakeSessionProperties) RemoteIterator(org.apache.hadoop.fs.RemoteIterator)
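
The listing loop above reduces to a per-file rule. A minimal sketch of that rule as a hypothetical helper (not part of the actual procedure):

// Returns true if vacuum may delete the file; mirrors the checks in the loop above
static boolean shouldDelete(String relativePath, long modificationTime, Set<String> retainedPaths, Instant threshold) {
    if (relativePath.equals(TRANSACTION_LOG_DIRECTORY) || relativePath.startsWith(TRANSACTION_LOG_DIRECTORY + "/")) {
        return false; // never touch anything under _delta_log/
    }
    if (retainedPaths.contains(relativePath)) {
        return false; // still referenced by the active snapshot or a "recent" transaction
    }
    // keep files newer than the retention threshold; they may belong to an in-flight write
    return Instant.ofEpochMilli(modificationTime).isBefore(threshold);
}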

Example 5 with RemoveFileEntry

Use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.

From class CheckpointEntryIterator, method buildRemoveEntry:

private DeltaLakeTransactionLogEntry buildRemoveEntry(ConnectorSession session, Block block, int pagePosition) {
    log.debug("Building remove entry from %s pagePosition %d", block, pagePosition);
    if (block.isNull(pagePosition)) {
        return null;
    }
    int removeFields = 3;
    Block removeEntryBlock = block.getObject(pagePosition, Block.class);
    log.debug("Block %s has %s fields", block, removeEntryBlock.getPositionCount());
    if (removeEntryBlock.getPositionCount() != removeFields) {
        throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, format("Expected block %s to have %d children, but found %s", block, removeFields, removeEntryBlock.getPositionCount()));
    }
    RemoveFileEntry result = new RemoveFileEntry(getString(removeEntryBlock, 0), getLong(removeEntryBlock, 1), getByte(removeEntryBlock, 2) != 0);
    log.debug("Result: %s", result);
    return DeltaLakeTransactionLogEntry.removeFileEntry(result);
}
Also used : Block(io.trino.spi.block.Block) TrinoException(io.trino.spi.TrinoException) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry)
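
The three checkpoint columns map positionally onto the RemoveFileEntry constructor, matching the values asserted in Example 1. For illustration:

// field 0: path (string), field 1: deletionTimestamp (epoch millis), field 2: dataChange (boolean)
RemoveFileEntry entry = new RemoveFileEntry(
        "age=42/part-00000-951068bd-bcf4-4094-bb94-536f3c41d31f.c000.snappy.parquet",
        1579190155406L,
        false);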

Aggregations

RemoveFileEntry (io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) 8
AddFileEntry (io.trino.plugin.deltalake.transactionlog.AddFileEntry) 6
MetadataEntry (io.trino.plugin.deltalake.transactionlog.MetadataEntry) 4
ProtocolEntry (io.trino.plugin.deltalake.transactionlog.ProtocolEntry) 4
TrinoException (io.trino.spi.TrinoException) 4
SchemaTableName (io.trino.spi.connector.SchemaTableName) 3
Path (org.apache.hadoop.fs.Path) 3
JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException) 2
Preconditions.checkState (com.google.common.base.Preconditions.checkState) 2
ImmutableList (com.google.common.collect.ImmutableList) 2
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList) 2
ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet) 2
Logger (io.airlift.log.Logger) 2
Slice (io.airlift.slice.Slice) 2
NotADeltaLakeTableException (io.trino.plugin.deltalake.metastore.NotADeltaLakeTableException) 2
CommitInfoEntry (io.trino.plugin.deltalake.transactionlog.CommitInfoEntry) 2
DeltaLakeTransactionLogEntry (io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) 2
TransactionEntry (io.trino.plugin.deltalake.transactionlog.TransactionEntry) 2
TransactionConflictException (io.trino.plugin.deltalake.transactionlog.writer.TransactionConflictException) 2
TransactionLogWriter (io.trino.plugin.deltalake.transactionlog.writer.TransactionLogWriter) 2