Use of io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY in project trino by trinodb.
From the class TestTransactionLogAccess, method testSnapshotsAreConsistent:
@Test
public void testSnapshotsAreConsistent()
        throws Exception
{
    String tableName = "person";
    File tempDir = Files.createTempDir();
    File tableDir = new File(tempDir, tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    transactionLogDir.mkdirs();

    // Copy an initial set of transaction log entries and the last-checkpoint file into a temporary table directory
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());
    copyTransactionLogEntry(0, 12, resourceDir, transactionLogDir);
    Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME));

    // Load the table at its current version and capture the active files of that snapshot
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    List<AddFileEntry> expectedDataFiles = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);

    // Copy additional log entries; a freshly loaded snapshot sees the newly added files, the previously loaded snapshot must not
    copyTransactionLogEntry(12, 14, resourceDir, transactionLogDir);
    Set<String> newDataFiles = ImmutableSet.of(
            "age=28/part-00000-40dd1707-1d42-4328-a59a-21f5c945fe60.c000.snappy.parquet",
            "age=29/part-00000-3794c463-cb0c-4beb-8d07-7cc1e3b5920f.c000.snappy.parquet");
    TableSnapshot updatedTableSnapshot = transactionLogAccess.loadSnapshot(new SchemaTableName("schema", tableName), new Path(tableDir.toURI()), SESSION);
    List<AddFileEntry> allDataFiles = transactionLogAccess.getActiveFiles(updatedTableSnapshot, SESSION);
    List<AddFileEntry> dataFilesWithFixedVersion = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);
    for (String newFilePath : newDataFiles) {
        assertTrue(allDataFiles.stream().anyMatch(entry -> entry.getPath().equals(newFilePath)));
        assertTrue(dataFilesWithFixedVersion.stream().noneMatch(entry -> entry.getPath().equals(newFilePath)));
    }

    // Re-reading the pinned snapshot must yield exactly the entries captured before the new commits
    assertEquals(expectedDataFiles.size(), dataFilesWithFixedVersion.size());
    List<ColumnMetadata> columns = extractSchema(transactionLogAccess.getMetadataEntry(tableSnapshot, SESSION).get(), TESTING_TYPE_MANAGER);
    for (int i = 0; i < expectedDataFiles.size(); i++) {
        AddFileEntry expected = expectedDataFiles.get(i);
        AddFileEntry actual = dataFilesWithFixedVersion.get(i);
        assertEquals(expected.getPath(), actual.getPath());
        assertEquals(expected.getPartitionValues(), actual.getPartitionValues());
        assertEquals(expected.getSize(), actual.getSize());
        assertEquals(expected.getModificationTime(), actual.getModificationTime());
        assertEquals(expected.isDataChange(), actual.isDataChange());
        assertEquals(expected.getTags(), actual.getTags());

        assertTrue(expected.getStats().isPresent());
        assertTrue(actual.getStats().isPresent());
        for (ColumnMetadata column : columns) {
            DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR);
            assertEquals(expected.getStats().get().getMinColumnValue(columnHandle), actual.getStats().get().getMinColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getMaxColumnValue(columnHandle), actual.getStats().get().getMaxColumnValue(columnHandle));
            assertEquals(expected.getStats().get().getNullCount(columnHandle.getName()), actual.getStats().get().getNullCount(columnHandle.getName()));
            assertEquals(expected.getStats().get().getNumRecords(), actual.getStats().get().getNumRecords());
        }
    }
}
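For context, copyTransactionLogEntry is a helper defined elsewhere in TestTransactionLogAccess and not shown in this excerpt. A minimal sketch of what such a helper could look like, assuming Guava's com.google.common.io.Files (already used above) and the Delta Lake convention of naming commit files with the version zero-padded to 20 digits:

private void copyTransactionLogEntry(int startVersion, int endVersionExclusive, File sourceDir, File targetDir)
        throws IOException
{
    // Hypothetical sketch; the real helper lives in TestTransactionLogAccess
    for (long version = startVersion; version < endVersionExclusive; version++) {
        // Delta Lake commit files are named e.g. 00000000000000000011.json
        String entryName = String.format("%020d.json", version);
        Files.copy(new File(sourceDir, entryName), new File(targetDir, entryName));
    }
}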
Use of io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY in project trino by trinodb.
From the class VacuumProcedure, method doVacuum:
private void doVacuum(ConnectorSession session, String schema, String table, String retention)
        throws IOException
{
    checkProcedureArgument(schema != null, "schema_name cannot be null");
    checkProcedureArgument(!schema.isEmpty(), "schema_name cannot be empty");
    checkProcedureArgument(table != null, "table_name cannot be null");
    checkProcedureArgument(!table.isEmpty(), "table_name cannot be empty");
    checkProcedureArgument(retention != null, "retention cannot be null");

    Duration retentionDuration = Duration.valueOf(retention);
    Duration minRetention = getVacuumMinRetention(session);
    checkProcedureArgument(
            retentionDuration.compareTo(minRetention) >= 0,
            "Retention specified (%s) is shorter than the minimum retention configured in the system (%s). " +
                    "Minimum retention can be changed with %s configuration property or %s.%s session property",
            retentionDuration,
            minRetention,
            DeltaLakeConfig.VACUUM_MIN_RETENTION,
            catalogName,
            DeltaLakeSessionProperties.VACUUM_MIN_RETENTION);

    Instant threshold = Instant.now().minusMillis(retentionDuration.toMillis());

    DeltaLakeMetadata metadata = metadataFactory.create(session.getIdentity());
    SchemaTableName tableName = new SchemaTableName(schema, table);
    DeltaLakeTableHandle handle = metadata.getTableHandle(session, tableName);
    checkProcedureArgument(handle != null, "Table '%s' does not exist", tableName);

    TableSnapshot tableSnapshot = transactionLogAccess.loadSnapshot(tableName, new Path(handle.getLocation()), session);
    Path tableLocation = tableSnapshot.getTableLocation();
    Path transactionLogDir = getTransactionLogDir(tableLocation);
    FileSystem fileSystem = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(session), tableLocation);
    String commonPathPrefix = tableLocation + "/";
    String queryId = session.getQueryId();

    // Retain all active files and every file removed by a "recent" transaction (except for the oldest "recent").
    // Any remaining file is not live and is not needed to read any "recent" snapshot.
    List<Long> recentVersions = transactionLogAccess.getPastTableVersions(fileSystem, transactionLogDir, threshold, tableSnapshot.getVersion());
    Set<String> retainedPaths = Stream.concat(
            transactionLogAccess.getActiveFiles(tableSnapshot, session).stream()
                    .map(AddFileEntry::getPath),
            transactionLogAccess.getJsonEntries(
                    fileSystem,
                    transactionLogDir,
                    // skip the oldest "recent" version; only RemoveFileEntry records are taken from these entries, to find
                    // files that are no longer active files, but still needed to read a "recent" snapshot
                    recentVersions.stream().sorted(naturalOrder()).skip(1).collect(toImmutableList()))
                    .map(DeltaLakeTransactionLogEntry::getRemove)
                    .filter(Objects::nonNull)
                    .map(RemoveFileEntry::getPath))
            .peek(path -> checkState(!path.startsWith(tableLocation.toString()), "Unexpected absolute path in transaction log: %s", path))
            .collect(toImmutableSet());
    log.debug(
            "[%s] attempting to vacuum table %s [%s] with %s retention (expiry threshold %s). %s data file paths marked for retention",
            queryId, tableName, tableLocation, retention, threshold, retainedPaths.size());

    long nonFiles = 0;
    long allPathsChecked = 0;
    long transactionLogFiles = 0;
    long retainedKnownFiles = 0;
    long retainedUnknownFiles = 0;
    long removedFiles = 0;

    RemoteIterator<LocatedFileStatus> listing = fileSystem.listFiles(tableLocation, true);
    while (listing.hasNext()) {
        LocatedFileStatus fileStatus = listing.next();
        Path path = fileStatus.getPath();
        checkState(path.toString().startsWith(commonPathPrefix), "Unexpected path [%s] returned when listing files under [%s]", path, tableLocation);
        String relativePath = path.toString().substring(commonPathPrefix.length());
        if (relativePath.isEmpty()) {
            // A file returned for "tableLocation/", might be possible on S3.
            continue;
        }
        allPathsChecked++;

        // TODO Note: Databricks can delete directories during vacuum on s3. This might need to be revisited.
        if (!fileStatus.isFile()) {
            nonFiles++;
            continue;
        }

        // ignore tableLocation/_delta_log/**
        if (relativePath.equals(TRANSACTION_LOG_DIRECTORY) || relativePath.startsWith(TRANSACTION_LOG_DIRECTORY + "/")) {
            log.debug("[%s] skipping a file inside transaction log dir: %s", queryId, path);
            transactionLogFiles++;
            continue;
        }

        // skip retained files
        if (retainedPaths.contains(relativePath)) {
            log.debug("[%s] retaining a known file: %s", queryId, path);
            retainedKnownFiles++;
            continue;
        }

        // ignore recently created files
        long modificationTime = fileStatus.getModificationTime();
        Instant modificationInstant = Instant.ofEpochMilli(modificationTime);
        if (!modificationInstant.isBefore(threshold)) {
            log.debug("[%s] retaining an unknown file %s with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
            retainedUnknownFiles++;
            continue;
        }

        log.debug("[%s] deleting file [%s] with modification time %s (%s)", queryId, path, modificationTime, modificationInstant);
        if (!fileSystem.delete(path, false)) {
            throw new TrinoException(GENERIC_INTERNAL_ERROR, "Failed to delete file: " + path);
        }
        removedFiles++;
    }

    log.info(
            "[%s] finished vacuuming table %s [%s]: files checked: %s; non-files: %s; metadata files: %s; retained known files: %s; retained unknown files: %s; removed files: %s",
            queryId, tableName, tableLocation,
            allPathsChecked, nonFiles, transactionLogFiles, retainedKnownFiles, retainedUnknownFiles, removedFiles);
}
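Both usages resolve the log directory through TransactionLogUtil, whose TRANSACTION_LOG_DIRECTORY constant names the _delta_log folder seen in the test resources and in the "ignore tableLocation/_delta_log/**" check above. As a rough sketch (not the verbatim source), the relevant pieces of the utility reduce to the constant plus a helper that appends it to the table location:

// Approximate sketch of io.trino.plugin.deltalake.transactionlog.TransactionLogUtil
public static final String TRANSACTION_LOG_DIRECTORY = "_delta_log";

public static Path getTransactionLogDir(Path tableLocation)
{
    return new Path(tableLocation, TRANSACTION_LOG_DIRECTORY);
}

On the SQL side, doVacuum backs the Delta Lake connector's vacuum procedure, invoked along the lines of CALL mycatalog.system.vacuum('myschema', 'mytable', '7d'), where the catalog, schema, and table names are placeholders.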