Use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator in project trino by trinodb.
From the class TestDeltaLakeFileStatistics, method testParseParquetStatistics.
/**
 * Reads a Parquet checkpoint file from the test resources and checks the statistics
 * attached to one specific ADD entry.
 *
 * <p>The checkpoint is read twice: first restricted to the METADATA entry type to obtain
 * the single {@link MetadataEntry}, then restricted to the ADD entry type (with that
 * metadata entry supplied) to locate the add-file entry whose path contains the expected
 * data file name. The test asserts that exactly one such entry exists, that it carries
 * statistics, and then hands those statistics to {@code testStatisticsValues}.
 *
 * @throws Exception if the resource cannot be resolved or the checkpoint cannot be read
 */
@Test
public void testParseParquetStatistics() throws Exception {
    // Resolve the resource through its URI rather than URL.getFile(): getFile() returns the
    // percent-encoded URL path, which is not a valid file-system path when the checkout
    // location contains spaces or other characters that get escaped in URLs.
    File statsFile = new File(getClass().getResource("/databricks/pruning/parquet_struct_statistics/_delta_log/00000000000000000010.checkpoint.parquet").toURI());
    Path checkpointPath = new Path(statsFile.toURI());

    TypeManager typeManager = TESTING_TYPE_MANAGER;
    CheckpointSchemaManager checkpointSchemaManager = new CheckpointSchemaManager(typeManager);

    HdfsConfig hdfsConfig = new HdfsConfig();
    HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hdfsConfig), ImmutableSet.of());
    HdfsEnvironment hdfsEnvironment = new HdfsEnvironment(hdfsConfiguration, hdfsConfig, new NoHdfsAuthentication());
    FileSystem fs = hdfsEnvironment.getFileSystem(new HdfsEnvironment.HdfsContext(SESSION), checkpointPath);

    // Both iterators read the same file, so look its length up once.
    long checkpointLength = fs.getFileStatus(checkpointPath).getLen();

    // First pass: only the METADATA entry, which the second pass takes as an input.
    CheckpointEntryIterator metadataEntryIterator = new CheckpointEntryIterator(
            checkpointPath,
            SESSION,
            checkpointLength,
            checkpointSchemaManager,
            typeManager,
            ImmutableSet.of(METADATA),
            Optional.empty(),
            hdfsEnvironment,
            new FileFormatDataSourceStats(),
            new ParquetReaderConfig().toParquetReaderOptions(),
            true);
    MetadataEntry metadataEntry = getOnlyElement(metadataEntryIterator).getMetaData();

    // Second pass: only ADD entries, read with the metadata entry obtained above.
    CheckpointEntryIterator checkpointEntryIterator = new CheckpointEntryIterator(
            checkpointPath,
            SESSION,
            checkpointLength,
            checkpointSchemaManager,
            typeManager,
            ImmutableSet.of(CheckpointEntryIterator.EntryType.ADD),
            Optional.of(metadataEntry),
            hdfsEnvironment,
            new FileFormatDataSourceStats(),
            new ParquetReaderConfig().toParquetReaderOptions(),
            true);

    DeltaLakeTransactionLogEntry matchingAddFileEntry = null;
    while (checkpointEntryIterator.hasNext()) {
        DeltaLakeTransactionLogEntry entry = checkpointEntryIterator.next();
        if (entry.getAdd() != null && entry.getAdd().getPath().contains("part-00000-17951bea-0d04-43c1-979c-ea1fac19b382-c000.snappy.parquet")) {
            // Fails if a second entry matches: the data file must appear exactly once.
            assertNull(matchingAddFileEntry);
            matchingAddFileEntry = entry;
        }
    }

    assertNotNull(matchingAddFileEntry);
    assertThat(matchingAddFileEntry.getAdd().getStats()).isPresent();
    testStatisticsValues(matchingAddFileEntry.getAdd().getStats().get());
}
Use of io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointEntryIterator in project trino by trinodb.
From the class TableSnapshot, method getCheckpointTransactionLogEntries.
/**
 * Opens the checkpoint file at {@code checkpointPath} and returns its transaction log
 * entries as a stream backed by a {@link CheckpointEntryIterator}.
 *
 * @param session session handed to the checkpoint iterator
 * @param entryTypes entry types handed to the checkpoint iterator
 * @param metadataEntry optional metadata entry handed to the checkpoint iterator
 * @param checkpointSchemaManager schema manager handed to the checkpoint iterator
 * @param typeManager type manager handed to the checkpoint iterator
 * @param fileSystem file system used to look up the checkpoint file's status (its length)
 * @param hdfsEnvironment HDFS environment handed to the checkpoint iterator
 * @param stats data source statistics sink handed to the checkpoint iterator
 * @param checkpoint checkpoint descriptor; used only in the error message when the file is missing
 * @param checkpointPath location of the checkpoint file to read
 * @return a stream over the entries produced by the checkpoint iterator
 * @throws TrinoException with {@code DELTA_LAKE_INVALID_SCHEMA} if the checkpoint file does not exist
 * @throws IOException if the underlying file system access fails for a reason other than
 *         the checkpoint file being missing
 */
private Stream<DeltaLakeTransactionLogEntry> getCheckpointTransactionLogEntries(ConnectorSession session, Set<CheckpointEntryIterator.EntryType> entryTypes, Optional<MetadataEntry> metadataEntry, CheckpointSchemaManager checkpointSchemaManager, TypeManager typeManager, FileSystem fileSystem, HdfsEnvironment hdfsEnvironment, FileFormatDataSourceStats stats, LastCheckpoint checkpoint, Path checkpointPath) throws IOException {
    FileStatus fileStatus;
    try {
        fileStatus = fileSystem.getFileStatus(checkpointPath);
    } catch (FileNotFoundException e) {
        // Keep the original exception as the cause so the failing path and file-system
        // stack trace are not lost when the error is reported.
        throw new TrinoException(DELTA_LAKE_INVALID_SCHEMA, format("%s mentions a non-existent checkpoint file for table: %s", checkpoint, table), e);
    }
    Iterator<DeltaLakeTransactionLogEntry> checkpointEntryIterator = new CheckpointEntryIterator(
            checkpointPath,
            session,
            fileStatus.getLen(),
            checkpointSchemaManager,
            typeManager,
            entryTypes,
            metadataEntry,
            hdfsEnvironment,
            stats,
            parquetReaderOptions,
            checkpointRowStatisticsWritingEnabled);
    return stream(checkpointEntryIterator);
}
Aggregations