use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.
the class TestCheckpointEntryIterator method testReadAllEntries.
/**
 * Iterates the test checkpoint while requesting every entry type, then verifies the total
 * number of entries, the entries found at a few fixed positions, and that the checkpoint
 * contains neither transaction nor commit info entries.
 */
@Test
public void testReadAllEntries()
        throws Exception
{
    URI checkpointLocation = getResource(TEST_CHECKPOINT).toURI();
    MetadataEntry expectedMetadata = readMetadataEntry(checkpointLocation);
    CheckpointEntryIterator iterator = createCheckpointEntryIterator(
            checkpointLocation,
            ImmutableSet.of(METADATA, PROTOCOL, TRANSACTION, ADD, REMOVE, COMMIT),
            Optional.of(readMetadataEntry(checkpointLocation)));
    List<DeltaLakeTransactionLogEntry> allEntries = ImmutableList.copyOf(iterator);

    assertThat(allEntries).hasSize(17);

    // Metadata entry: expected at index 12
    assertThat(allEntries)
            .element(12)
            .extracting(DeltaLakeTransactionLogEntry::getMetaData)
            .isEqualTo(expectedMetadata);

    // Protocol entry: expected at index 11
    ProtocolEntry expectedProtocol = new ProtocolEntry(1, 2);
    assertThat(allEntries)
            .element(11)
            .extracting(DeltaLakeTransactionLogEntry::getProtocol)
            .isEqualTo(expectedProtocol);

    // Transaction entries: the test checkpoint has none. TODO add a test
    assertThat(allEntries)
            .map(DeltaLakeTransactionLogEntry::getTxn)
            .filteredOn(Objects::nonNull)
            .isEmpty();

    // Add file entry: expected at index 8
    String expectedStats = "{"
            + "\"numRecords\":1,"
            + "\"minValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"maxValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"nullCount\":{\"name\":0,\"married\":0,\"phones\":0,\"address\":{\"street\":0,\"city\":0,\"state\":0,\"zip\":0},\"income\":0}"
            + "}";
    AddFileEntry expectedAdd = new AddFileEntry(
            "age=42/part-00003-0f53cae3-3e34-4876-b651-e1db9584dbc3.c000.snappy.parquet",
            Map.of("age", "42"),
            2634,
            1579190165000L,
            false,
            Optional.of(expectedStats),
            Optional.empty(),
            null);
    assertThat(allEntries)
            .element(8)
            .extracting(DeltaLakeTransactionLogEntry::getAdd)
            .isEqualTo(expectedAdd);

    // Remove file entry: expected at index 3
    RemoveFileEntry expectedRemove = new RemoveFileEntry(
            "age=42/part-00000-951068bd-bcf4-4094-bb94-536f3c41d31f.c000.snappy.parquet",
            1579190155406L,
            false);
    assertThat(allEntries)
            .element(3)
            .extracting(DeltaLakeTransactionLogEntry::getRemove)
            .isEqualTo(expectedRemove);

    // Commit info entries: the test checkpoint has none. TODO add a test
    assertThat(allEntries)
            .map(DeltaLakeTransactionLogEntry::getCommitInfo)
            .filteredOn(Objects::nonNull)
            .isEmpty();
}
use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.
the class TestCheckpointBuilder method testCheckpointBuilder.
/**
 * Feeds a sequence of metadata, protocol, transaction, add and remove entries into a
 * {@code CheckpointBuilder} and checks the resulting {@code CheckpointEntries}.
 */
@Test
public void testCheckpointBuilder()
{
    CheckpointBuilder checkpointBuilder = new CheckpointBuilder();

    // Two metadata entries; the expected checkpoint holds the second one
    MetadataEntry firstMetadata = new MetadataEntry("1", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    MetadataEntry secondMetadata = new MetadataEntry("2", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    for (MetadataEntry metadata : List.of(firstMetadata, secondMetadata)) {
        checkpointBuilder.addLogEntry(metadataEntry(metadata));
    }

    // Two protocol entries; the expected checkpoint holds the second one
    ProtocolEntry firstProtocol = new ProtocolEntry(1, 2);
    ProtocolEntry secondProtocol = new ProtocolEntry(3, 4);
    for (ProtocolEntry protocol : List.of(firstProtocol, secondProtocol)) {
        checkpointBuilder.addLogEntry(protocolEntry(protocol));
    }

    // Transactions for two applications, added out of version order:
    // app1's V3 is expected in the checkpoint even though V1 is added after it
    TransactionEntry app1V1 = new TransactionEntry("app1", 1, 1);
    TransactionEntry app1V2 = new TransactionEntry("app1", 2, 2);
    TransactionEntry app1V3 = new TransactionEntry("app1", 3, 3);
    TransactionEntry app2V5 = new TransactionEntry("app2", 5, 5);
    for (TransactionEntry transaction : List.of(app1V2, app1V3, app1V1, app2V5)) {
        checkpointBuilder.addLogEntry(transactionEntry(transaction));
    }

    // File "a": added, removed, added again. File "b": added, then removed. File "c": removed only.
    AddFileEntry firstAddOfA = new AddFileEntry("a", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeOfA = new RemoveFileEntry("a", 1, true);
    AddFileEntry secondAddOfA = new AddFileEntry("a", Map.of(), 2, 1, true, Optional.empty(), Optional.empty(), Map.of());
    AddFileEntry addOfB = new AddFileEntry("b", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeOfB = new RemoveFileEntry("b", 1, true);
    RemoveFileEntry removeOfC = new RemoveFileEntry("c", 1, true);
    checkpointBuilder.addLogEntry(addFileEntry(firstAddOfA));
    checkpointBuilder.addLogEntry(removeFileEntry(removeOfA));
    checkpointBuilder.addLogEntry(addFileEntry(secondAddOfA));
    checkpointBuilder.addLogEntry(addFileEntry(addOfB));
    checkpointBuilder.addLogEntry(removeFileEntry(removeOfB));
    checkpointBuilder.addLogEntry(removeFileEntry(removeOfC));

    CheckpointEntries expected = new CheckpointEntries(
            secondMetadata,
            secondProtocol,
            Set.of(app1V3, app2V5),
            Set.of(secondAddOfA),
            Set.of(removeOfB, removeOfC));
    CheckpointEntries built = checkpointBuilder.build();
    assertEquals(expected, built);
}
use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.
the class CheckpointWriter method write.
/**
 * Writes the given checkpoint entries to {@code targetPath} as a single page through a
 * {@code RecordFileWriter} configured for PARQUET with SNAPPY compression.
 *
 * @param session session used for the HDFS context and passed to the writer
 * @param entries checkpoint content: metadata, protocol, transaction, add and remove entries
 * @param targetPath location the writer is created for
 */
public void write(ConnectorSession session, CheckpointEntries entries, Path targetPath)
{
    // One row type per checkpoint column; the "add" type is derived from the metadata entry
    RowType metadataType = checkpointSchemaManager.getMetadataEntryType();
    RowType protocolType = checkpointSchemaManager.getProtocolEntryType();
    RowType transactionType = checkpointSchemaManager.getTxnEntryType();
    RowType addType = checkpointSchemaManager.getAddEntryType(entries.getMetadataEntry());
    RowType removeType = checkpointSchemaManager.getRemoveEntryType();

    List<String> names = ImmutableList.of("metaData", "protocol", "txn", "add", "remove");
    List<Type> types = ImmutableList.of(metadataType, protocolType, transactionType, addType, removeType);
    Properties schemaProperties = buildSchemaProperties(names, types);

    HdfsEnvironment.HdfsContext hdfsContext = new HdfsEnvironment.HdfsContext(session);
    Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, targetPath);
    configureCompression(configuration, SNAPPY);
    JobConf jobConfiguration = toJobConf(configuration);

    RecordFileWriter fileWriter = new RecordFileWriter(
            targetPath,
            names,
            fromHiveStorageFormat(PARQUET),
            schemaProperties,
            PARQUET.getEstimatedWriterMemoryUsage(),
            jobConfiguration,
            typeManager,
            DateTimeZone.UTC,
            session);

    // Accumulate all entries into one page: metadata, protocol, transactions, adds, removes
    PageBuilder page = new PageBuilder(types);
    writeMetadataEntry(page, metadataType, entries.getMetadataEntry());
    writeProtocolEntry(page, protocolType, entries.getProtocolEntry());
    for (TransactionEntry transaction : entries.getTransactionEntries()) {
        writeTransactionEntry(page, transactionType, transaction);
    }
    for (AddFileEntry added : entries.getAddFileEntries()) {
        writeAddFileEntry(page, addType, added);
    }
    for (RemoveFileEntry removed : entries.getRemoveFileEntries()) {
        writeRemoveFileEntry(page, removeType, removed);
    }
    // Not writing commit infos for now. DB does not keep them in the checkpoints by default

    fileWriter.appendRows(page.build());
    fileWriter.commit();
}
use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.
the class VacuumProcedure method doVacuum.
/**
 * Deletes files under the table location that are not retained.
 * <p>
 * A listed file is kept when it lies inside the transaction log directory, when its relative path
 * is an active file of the current snapshot or was removed by a "recent" transaction log version
 * (all but the oldest one), or when its modification time is not before the retention threshold.
 * Every other file is deleted.
 *
 * @param session session used for session properties, identity, file system access and the query id used in log messages
 * @param schema schema name; must be neither null nor empty
 * @param table table name; must be neither null nor empty
 * @param retention retention period, parsed with {@code Duration.valueOf}; must not be shorter than the minimum returned by {@code getVacuumMinRetention(session)}
 * @throws IOException declared by this method; presumably raised by the file system listing/delete calls (NOTE(review): confirm)
 */
private void doVacuum(ConnectorSession session, String schema, String table, String retention) throws IOException {
// Validate the procedure arguments before touching any state
checkProcedureArgument(schema != null, "schema_name cannot be null");
checkProcedureArgument(!schema.isEmpty(), "schema_name cannot be empty");
checkProcedureArgument(table != null, "table_name cannot be null");
checkProcedureArgument(!table.isEmpty(), "table_name cannot be empty");
checkProcedureArgument(retention != null, "retention cannot be null");
Duration retentionDuration = Duration.valueOf(retention);
Duration minRetention = getVacuumMinRetention(session);
checkProcedureArgument(retentionDuration.compareTo(minRetention) >= 0, "Retention specified (%s) is shorter than the minimum retention configured in the system (%s). " + "Minimum retention can be changed with %s configuration property or %s.%s session property", retentionDuration, minRetention, DeltaLakeConfig.VACUUM_MIN_RETENTION, catalogName, DeltaLakeSessionProperties.VACUUM_MIN_RETENTION);
// Files whose modification time is not before this instant are never deleted (see the check in the loop below)
Instant threshold = Instant.now().minusMillis(retentionDuration.toMillis());
DeltaLakeMetadata metadata = metadataFactory.create(session.getIdentity());
SchemaTableName tableName = new SchemaTableName(schema, table);
DeltaLakeTableHandle handle = metadata.getTableHandle(session, tableName);
checkProcedureArgument(handle != null, "Table '%s' does not exist", tableName);
TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(tableName, new Path(handle.getLocation()), session);
Path tableLocation = tableSnapshot.getTableLocation();
Path transactionLogDir = getTransactionLogDir(tableLocation);
FileSystem fileSystem = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(session), tableLocation);
// Prefix stripped from every listed path to obtain its path relative to the table location
String commonPathPrefix = tableLocation + "/";
String queryId = session.getQueryId();
// Retain all active files and every file removed by a "recent" transaction (except for the oldest "recent").
// Any remaining files are not live, and not needed to read any "recent" snapshot.
List<Long> recentVersions = transactionLogAccess.getPastTableVersions(fileSystem, transactionLogDir, threshold, tableSnapshot.getVersion());
Set<String> retainedPaths = Stream.concat(transactionLogAccess.getActiveFiles(tableSnapshot, session).stream().map(AddFileEntry::getPath), transactionLogAccess.getJsonEntries(fileSystem, transactionLogDir, // skip the oldest "recent" version: only remove entries are collected here, to identify files that are no longer active files, but still needed to read a "recent" snapshot
recentVersions.stream().sorted(naturalOrder()).skip(1).collect(toImmutableList())).map(DeltaLakeTransactionLogEntry::getRemove).filter(Objects::nonNull).map(RemoveFileEntry::getPath)).peek(path -> checkState(!path.startsWith(tableLocation.toString()), "Unexpected absolute path in transaction log: %s", path)).collect(toImmutableSet());
log.debug("[%s] attempting to vacuum table %s [%s] with %s retention (expiry threshold %s). %s data file paths marked for retention", queryId, tableName, tableLocation, retention, threshold, retainedPaths.size());
// Counters reported in the summary log line at the end of the method
long nonFiles = 0;
long allPathsChecked = 0;
long transactionLogFiles = 0;
long retainedKnownFiles = 0;
long retainedUnknownFiles = 0;
long removedFiles = 0;
// Recursive listing of everything under the table location
RemoteIterator<LocatedFileStatus> listing = fileSystem.listFiles(tableLocation, true);
while (listing.hasNext()) {
LocatedFileStatus fileStatus = listing.next();
Path path = fileStatus.getPath();
checkState(path.toString().startsWith(commonPathPrefix), "Unexpected path [%s] returned when listing files under [%s]", path, tableLocation);
// Relative form is what gets compared against retainedPaths and the transaction log directory name
String relativePath = path.toString().substring(commonPathPrefix.length());
if (relativePath.isEmpty()) {
// A file returned for "tableLocation/", might be possible on S3.
continue;
}
allPathsChecked++;
// TODO Note: Databricks can delete directories during vacuum on s3. This might need to be revisited.
if (!fileStatus.isFile()) {
nonFiles++;
continue;
}
// ignore tableLocation/_delta_log/**
if (relativePath.equals(TRANSACTION_LOG_DIRECTORY) || relativePath.startsWith(TRANSACTION_LOG_DIRECTORY + "/")) {
log.debug("[%s] skipping a file inside transaction log dir: %s", queryId, path);
transactionLogFiles++;
continue;
}
// skip retained files
if (retainedPaths.contains(relativePath)) {
log.debug("[%s] retaining a known file: %s", queryId, path);
retainedKnownFiles++;
continue;
}
// ignore recently created files
long modificationTime = fileStatus.getModificationTime();
Instant modificationInstant = Instant.ofEpochMilli(modificationTime);
if (!modificationInstant.isBefore(threshold)) {
log.debug("[%s] retaining an unknown file %s with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
retainedUnknownFiles++;
continue;
}
// Not retained by any rule above and older than the threshold: delete it
log.debug("[%s] deleting file [%s] with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
// A false return from delete is treated as a failure and aborts the vacuum
if (!fileSystem.delete(path, false)) {
throw new TrinoException(GENERIC_INTERNAL_ERROR, "Failed to delete file: " + path);
}
removedFiles++;
}
// Summary; transaction log files are reported under the "metadata files" label
log.info("[%s] finished vacuuming table %s [%s]: files checked: %s; non-files: %s; metadata files: %s; retained known files: %s; retained unknown files: %s; removed files: %s", queryId, tableName, tableLocation, allPathsChecked, nonFiles, transactionLogFiles, retainedKnownFiles, retainedUnknownFiles, removedFiles);
}
use of io.trino.plugin.deltalake.transactionlog.RemoveFileEntry in project trino by trinodb.
the class CheckpointEntryIterator method buildRemoveEntry.
/**
 * Builds a remove-file log entry from the row stored at {@code pagePosition} of {@code block}.
 *
 * @param session not used by this method
 * @param block block holding the remove entries
 * @param pagePosition position within {@code block} to read
 * @return the wrapped {@code RemoveFileEntry}, or {@code null} when the position is null
 * @throws TrinoException with {@code DELTA_LAKE_INVALID_SCHEMA} when the row does not have exactly 3 fields
 */
private DeltaLakeTransactionLogEntry buildRemoveEntry(ConnectorSession session, Block block, int pagePosition)
{
    log.debug("Building remove entry from %s pagePosition %d", block, pagePosition);
    if (block.isNull(pagePosition)) {
        return null;
    }

    final int expectedFieldCount = 3;
    Block entryFields = block.getObject(pagePosition, Block.class);
    int actualFieldCount = entryFields.getPositionCount();
    log.debug("Block %s has %s fields", block, actualFieldCount);
    if (actualFieldCount != expectedFieldCount) {
        String message = format("Expected block %s to have %d children, but found %s", block, expectedFieldCount, actualFieldCount);
        throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, message);
    }

    // Fields are read positionally, in the order the RemoveFileEntry constructor takes them
    RemoveFileEntry removeEntry = new RemoveFileEntry(
            getString(entryFields, 0),
            getLong(entryFields, 1),
            getByte(entryFields, 2) != 0);
    log.debug("Result: %s", removeEntry);
    return DeltaLakeTransactionLogEntry.removeFileEntry(removeEntry);
}
Aggregations