Search in sources :

Example 1 with AddFileEntry

use of io.trino.plugin.deltalake.transactionlog.AddFileEntry in project trino by trinodb.

the class TestCheckpointEntryIterator method testReadAllEntries.

/**
 * Reads the test checkpoint while requesting every entry type and verifies the total
 * number of entries plus one representative entry of each type at its known position.
 * Transaction and commit-info entries are expected to be absent from this checkpoint.
 */
@Test
public void testReadAllEntries() throws Exception {
    URI checkpointUri = getResource(TEST_CHECKPOINT).toURI();
    MetadataEntry metadataEntry = readMetadataEntry(checkpointUri);
    CheckpointEntryIterator checkpointEntryIterator = createCheckpointEntryIterator(
            checkpointUri,
            ImmutableSet.of(METADATA, PROTOCOL, TRANSACTION, ADD, REMOVE, COMMIT),
            Optional.of(readMetadataEntry(checkpointUri)));
    List<DeltaLakeTransactionLogEntry> entries = ImmutableList.copyOf(checkpointEntryIterator);

    assertThat(entries).hasSize(17);

    // MetadataEntry
    assertThat(entries)
            .element(12)
            .extracting(DeltaLakeTransactionLogEntry::getMetaData)
            .isEqualTo(metadataEntry);

    // ProtocolEntry
    ProtocolEntry protocolInCheckpoint = new ProtocolEntry(1, 2);
    assertThat(entries)
            .element(11)
            .extracting(DeltaLakeTransactionLogEntry::getProtocol)
            .isEqualTo(protocolInCheckpoint);

    // TransactionEntry
    // not found in the checkpoint, TODO add a test
    assertThat(entries)
            .map(DeltaLakeTransactionLogEntry::getTxn)
            .filteredOn(Objects::nonNull)
            .isEmpty();

    // AddFileEntry
    String aliceFileStats = "{"
            + "\"numRecords\":1,"
            + "\"minValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"maxValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"nullCount\":{\"name\":0,\"married\":0,\"phones\":0,\"address\":{\"street\":0,\"city\":0,\"state\":0,\"zip\":0},\"income\":0}"
            + "}";
    AddFileEntry addInCheckpoint = new AddFileEntry(
            "age=42/part-00003-0f53cae3-3e34-4876-b651-e1db9584dbc3.c000.snappy.parquet",
            Map.of("age", "42"),
            2634,
            1579190165000L,
            false,
            Optional.of(aliceFileStats),
            Optional.empty(),
            null);
    assertThat(entries)
            .element(8)
            .extracting(DeltaLakeTransactionLogEntry::getAdd)
            .isEqualTo(addInCheckpoint);

    // RemoveFileEntry
    RemoveFileEntry removeInCheckpoint = new RemoveFileEntry(
            "age=42/part-00000-951068bd-bcf4-4094-bb94-536f3c41d31f.c000.snappy.parquet",
            1579190155406L,
            false);
    assertThat(entries)
            .element(3)
            .extracting(DeltaLakeTransactionLogEntry::getRemove)
            .isEqualTo(removeInCheckpoint);

    // CommitInfoEntry
    // not found in the checkpoint, TODO add a test
    assertThat(entries)
            .map(DeltaLakeTransactionLogEntry::getCommitInfo)
            .filteredOn(Objects::nonNull)
            .isEmpty();
}
Also used : ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) URI(java.net.URI) Test(org.testng.annotations.Test)

Example 2 with AddFileEntry

use of io.trino.plugin.deltalake.transactionlog.AddFileEntry in project trino by trinodb.

the class TestCheckpointBuilder method testCheckpointBuilder.

/**
 * Feeds a sequence of metadata, protocol, transaction, add-file and remove-file log entries
 * into a {@link CheckpointBuilder} and verifies the {@link CheckpointEntries} it builds.
 *
 * <p>The expectation encoded below is:
 * <ul>
 * <li>of the two metadata entries and the two protocol entries, the one added last is kept;</li>
 * <li>for "app1" only {@code app1TransactionV3} is kept, even though the V1 entry is added
 * after it; "app2" keeps its single entry;</li>
 * <li>file "a" (add, remove, add) ends up as the add entry {@code addA2};</li>
 * <li>file "b" (add, remove) and file "c" (remove only) end up as remove entries.</li>
 * </ul>
 */
@Test
public void testCheckpointBuilder() {
    CheckpointBuilder builder = new CheckpointBuilder();

    // Two metadata entries that differ only in their first constructor argument.
    MetadataEntry metadata1 = new MetadataEntry("1", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    MetadataEntry metadata2 = new MetadataEntry("2", "", "", new MetadataEntry.Format("", Map.of()), "", List.of(), Map.of(), 1);
    builder.addLogEntry(metadataEntry(metadata1));
    builder.addLogEntry(metadataEntry(metadata2));

    ProtocolEntry protocol1 = new ProtocolEntry(1, 2);
    ProtocolEntry protocol2 = new ProtocolEntry(3, 4);
    builder.addLogEntry(protocolEntry(protocol1));
    builder.addLogEntry(protocolEntry(protocol2));

    TransactionEntry app1TransactionV1 = new TransactionEntry("app1", 1, 1);
    TransactionEntry app1TransactionV2 = new TransactionEntry("app1", 2, 2);
    TransactionEntry app1TransactionV3 = new TransactionEntry("app1", 3, 3);
    TransactionEntry app2TransactionV5 = new TransactionEntry("app2", 5, 5);
    // Deliberately added out of order: V1 arrives after V3.
    builder.addLogEntry(transactionEntry(app1TransactionV2));
    builder.addLogEntry(transactionEntry(app1TransactionV3));
    builder.addLogEntry(transactionEntry(app1TransactionV1));
    builder.addLogEntry(transactionEntry(app2TransactionV5));

    AddFileEntry addA1 = new AddFileEntry("a", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeA1 = new RemoveFileEntry("a", 1, true);
    AddFileEntry addA2 = new AddFileEntry("a", Map.of(), 2, 1, true, Optional.empty(), Optional.empty(), Map.of());
    AddFileEntry addB = new AddFileEntry("b", Map.of(), 1, 1, true, Optional.empty(), Optional.empty(), Map.of());
    RemoveFileEntry removeB = new RemoveFileEntry("b", 1, true);
    RemoveFileEntry removeC = new RemoveFileEntry("c", 1, true);
    builder.addLogEntry(addFileEntry(addA1));
    builder.addLogEntry(removeFileEntry(removeA1));
    builder.addLogEntry(addFileEntry(addA2));
    builder.addLogEntry(addFileEntry(addB));
    builder.addLogEntry(removeFileEntry(removeB));
    builder.addLogEntry(removeFileEntry(removeC));

    CheckpointEntries expectedCheckpoint = new CheckpointEntries(
            metadata2,
            protocol2,
            Set.of(app1TransactionV3, app2TransactionV5),
            Set.of(addA2),
            Set.of(removeB, removeC));
    // TestNG-style assertEquals takes (actual, expected); passing them in that order keeps
    // the "expected ... but found ..." failure message truthful.
    assertEquals(builder.build(), expectedCheckpoint);
}
Also used : ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) TransactionEntry(io.trino.plugin.deltalake.transactionlog.TransactionEntry) Test(org.testng.annotations.Test)

Example 3 with AddFileEntry

use of io.trino.plugin.deltalake.transactionlog.AddFileEntry in project trino by trinodb.

the class TestCheckpointEntryIterator method testReadAddEntries.

/**
 * Reads only the ADD entries from the test checkpoint and verifies how many are returned,
 * along with the exact contents of the entries at positions 3 and 7.
 */
@Test
public void testReadAddEntries() throws Exception {
    URI checkpointUri = getResource(TEST_CHECKPOINT).toURI();
    CheckpointEntryIterator checkpointEntryIterator = createCheckpointEntryIterator(
            checkpointUri,
            ImmutableSet.of(ADD),
            Optional.of(readMetadataEntry(checkpointUri)));
    List<DeltaLakeTransactionLogEntry> entries = ImmutableList.copyOf(checkpointEntryIterator);

    assertThat(entries).hasSize(9);

    // File in partition age=42 (statistics for "Alice")
    String aliceFileStats = "{"
            + "\"numRecords\":1,"
            + "\"minValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"maxValues\":{\"name\":\"Alice\",\"address\":{\"street\":\"100 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":111000.0},"
            + "\"nullCount\":{\"name\":0,\"married\":0,\"phones\":0,\"address\":{\"street\":0,\"city\":0,\"state\":0,\"zip\":0},\"income\":0}"
            + "}";
    AddFileEntry aliceFile = new AddFileEntry(
            "age=42/part-00003-0f53cae3-3e34-4876-b651-e1db9584dbc3.c000.snappy.parquet",
            Map.of("age", "42"),
            2634,
            1579190165000L,
            false,
            Optional.of(aliceFileStats),
            Optional.empty(),
            null);
    assertThat(entries)
            .element(3)
            .extracting(DeltaLakeTransactionLogEntry::getAdd)
            .isEqualTo(aliceFile);

    // File in partition age=30 (statistics for "Andy")
    String andyFileStats = "{"
            + "\"numRecords\":1,"
            + "\"minValues\":{\"name\":\"Andy\",\"address\":{\"street\":\"101 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":81000.0},"
            + "\"maxValues\":{\"name\":\"Andy\",\"address\":{\"street\":\"101 Main St\",\"city\":\"Anytown\",\"state\":\"NY\",\"zip\":\"12345\"},\"income\":81000.0},"
            + "\"nullCount\":{\"name\":0,\"married\":0,\"phones\":0,\"address\":{\"street\":0,\"city\":0,\"state\":0,\"zip\":0},\"income\":0}"
            + "}";
    AddFileEntry andyFile = new AddFileEntry(
            "age=30/part-00002-5800be2e-2373-47d8-8b86-776a8ea9d69f.c000.snappy.parquet",
            Map.of("age", "30"),
            2688,
            1579190165000L,
            false,
            Optional.of(andyFileStats),
            Optional.empty(),
            null);
    assertThat(entries)
            .element(7)
            .extracting(DeltaLakeTransactionLogEntry::getAdd)
            .isEqualTo(andyFile);
}
Also used : DeltaLakeTransactionLogEntry(io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) URI(java.net.URI) Test(org.testng.annotations.Test)

Example 4 with AddFileEntry

use of io.trino.plugin.deltalake.transactionlog.AddFileEntry in project trino by trinodb.

the class TestTransactionLogAccess method testUpdatingTailEntriesNoCheckpoint.

/**
 * Copies a first batch of "person" transaction log entries into a fresh temporary table
 * directory, checks the active data files, then copies a second batch, reloads the
 * snapshot and checks the active data files again. No checkpoint file is copied here.
 */
@Test
public void testUpdatingTailEntriesNoCheckpoint() throws Exception {
    String tableName = "person";
    File tableDir = new File(Files.createTempDir(), tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    transactionLogDir.mkdirs();
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());

    // First batch of log entries (copyTransactionLogEntry called with 0 and 7).
    copyTransactionLogEntry(0, 7, resourceDir, transactionLogDir);
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    Set<String> activePathsBeforeUpdate = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION).stream()
            .map(AddFileEntry::getPath)
            .collect(Collectors.toSet());
    Set<String> expectedPathsBeforeUpdate = ImmutableSet.of(
            "age=42/part-00000-b82d8859-84a0-4f05-872c-206b07dd54f0.c000.snappy.parquet",
            "age=30/part-00000-72a56c23-01ba-483a-9062-dd0accc86599.c000.snappy.parquet",
            "age=25/part-00000-609e34b1-5466-4dbc-a780-2708166e7adb.c000.snappy.parquet",
            "age=30/part-00000-7e43a3c3-ea26-4ae7-8eac-8f60cbb4df03.c000.snappy.parquet",
            "age=21/part-00000-3d546786-bedc-407f-b9f7-e97aa12cce0f.c000.snappy.parquet",
            "age=21/part-00001-290f0f26-19cf-4772-821e-36d55d9b7872.c000.snappy.parquet");
    assertEqualsIgnoreOrder(activePathsBeforeUpdate, expectedPathsBeforeUpdate);

    // Second batch of log entries (7 and 9), followed by a freshly loaded snapshot.
    copyTransactionLogEntry(7, 9, resourceDir, transactionLogDir);
    TableSnapshot updatedSnapshot = transactionLogAccess.loadSnapshot(
            new SchemaTableName("schema", tableName),
            new Path(tableDir.toURI()),
            SESSION);
    Set<String> activePathsAfterUpdate = transactionLogAccess.getActiveFiles(updatedSnapshot, SESSION).stream()
            .map(AddFileEntry::getPath)
            .collect(Collectors.toSet());
    Set<String> expectedPathsAfterUpdate = ImmutableSet.of(
            "age=21/part-00000-3d546786-bedc-407f-b9f7-e97aa12cce0f.c000.snappy.parquet",
            "age=21/part-00001-290f0f26-19cf-4772-821e-36d55d9b7872.c000.snappy.parquet",
            "age=30/part-00000-63c2205d-84a3-4a66-bd7c-f69f5af55bbc.c000.snappy.parquet",
            "age=25/part-00001-aceaf062-1cd1-45cb-8f83-277ffebe995c.c000.snappy.parquet",
            "age=30/part-00002-5800be2e-2373-47d8-8b86-776a8ea9d69f.c000.snappy.parquet",
            "age=42/part-00003-0f53cae3-3e34-4876-b651-e1db9584dbc3.c000.snappy.parquet",
            "age=25/part-00000-b7fbbe31-c7f9-44ed-8757-5c47d10c3e81.c000.snappy.parquet");
    assertEqualsIgnoreOrder(activePathsAfterUpdate, expectedPathsAfterUpdate);
}
Also used : Path(org.apache.hadoop.fs.Path) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) File(java.io.File) SchemaTableName(io.trino.spi.connector.SchemaTableName) Test(org.testng.annotations.Test)

Example 5 with AddFileEntry

use of io.trino.plugin.deltalake.transactionlog.AddFileEntry in project trino by trinodb.

the class TestTransactionLogAccess method testSnapshotsAreConsistent.

/**
 * Verifies that a table snapshot keeps returning the same active files after newer
 * transaction log entries appear on disk.
 *
 * <p>The test lists the active files for {@code tableSnapshot}, copies two further log
 * entries, and then checks that a freshly loaded snapshot sees the new data files while the
 * original snapshot does not, and that the original snapshot's files (path, partition values,
 * size, modification time, data-change flag, tags and per-column statistics) are unchanged.
 */
@Test
public void testSnapshotsAreConsistent() throws Exception {
    String tableName = "person";
    File tempDir = Files.createTempDir();
    File tableDir = new File(tempDir, tableName);
    File transactionLogDir = new File(tableDir, TRANSACTION_LOG_DIRECTORY);
    transactionLogDir.mkdirs();
    File resourceDir = new File(getClass().getClassLoader().getResource("databricks/person/_delta_log").toURI());
    copyTransactionLogEntry(0, 12, resourceDir, transactionLogDir);
    Files.copy(new File(resourceDir, LAST_CHECKPOINT_FILENAME), new File(transactionLogDir, LAST_CHECKPOINT_FILENAME));
    setupTransactionLogAccess(tableName, new Path(tableDir.toURI()));
    List<AddFileEntry> expectedDataFiles = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);

    // Make newer log entries visible on disk after the first listing was taken.
    copyTransactionLogEntry(12, 14, resourceDir, transactionLogDir);
    Set<String> newDataFiles = ImmutableSet.of(
            "age=28/part-00000-40dd1707-1d42-4328-a59a-21f5c945fe60.c000.snappy.parquet",
            "age=29/part-00000-3794c463-cb0c-4beb-8d07-7cc1e3b5920f.c000.snappy.parquet");
    TableSnapshot updatedTableSnapshot = transactionLogAccess.loadSnapshot(new SchemaTableName("schema", tableName), new Path(tableDir.toURI()), SESSION);
    List<AddFileEntry> allDataFiles = transactionLogAccess.getActiveFiles(updatedTableSnapshot, SESSION);
    List<AddFileEntry> dataFilesWithFixedVersion = transactionLogAccess.getActiveFiles(tableSnapshot, SESSION);

    // New files must be visible through the new snapshot only.
    for (String newFilePath : newDataFiles) {
        assertTrue(allDataFiles.stream().anyMatch(entry -> entry.getPath().equals(newFilePath)));
        assertTrue(dataFilesWithFixedVersion.stream().noneMatch(entry -> entry.getPath().equals(newFilePath)));
    }

    // All assertEquals calls below pass (actual, expected), matching the TestNG-style
    // signature, so that failure messages label the two values correctly.
    assertEquals(dataFilesWithFixedVersion.size(), expectedDataFiles.size());
    List<ColumnMetadata> columns = extractSchema(transactionLogAccess.getMetadataEntry(tableSnapshot, SESSION).get(), TESTING_TYPE_MANAGER);
    for (int i = 0; i < expectedDataFiles.size(); i++) {
        AddFileEntry expected = expectedDataFiles.get(i);
        AddFileEntry actual = dataFilesWithFixedVersion.get(i);
        assertEquals(actual.getPath(), expected.getPath());
        assertEquals(actual.getPartitionValues(), expected.getPartitionValues());
        assertEquals(actual.getSize(), expected.getSize());
        assertEquals(actual.getModificationTime(), expected.getModificationTime());
        assertEquals(actual.isDataChange(), expected.isDataChange());
        assertEquals(actual.getTags(), expected.getTags());
        assertTrue(expected.getStats().isPresent());
        assertTrue(actual.getStats().isPresent());
        // The record count does not depend on the column, so compare it once per file
        // rather than once per column.
        assertEquals(actual.getStats().get().getNumRecords(), expected.getStats().get().getNumRecords());
        for (ColumnMetadata column : columns) {
            DeltaLakeColumnHandle columnHandle = new DeltaLakeColumnHandle(column.getName(), column.getType(), REGULAR);
            assertEquals(actual.getStats().get().getMinColumnValue(columnHandle), expected.getStats().get().getMinColumnValue(columnHandle));
            assertEquals(actual.getStats().get().getMaxColumnValue(columnHandle), expected.getStats().get().getMaxColumnValue(columnHandle));
            assertEquals(actual.getStats().get().getNullCount(columnHandle.getName()), expected.getStats().get().getNullCount(columnHandle.getName()));
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) Test(org.testng.annotations.Test) DeltaLakeSchemaSupport.extractSchema(io.trino.plugin.deltalake.transactionlog.DeltaLakeSchemaSupport.extractSchema) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) RemoveFileEntry(io.trino.plugin.deltalake.transactionlog.RemoveFileEntry) NoHdfsAuthentication(io.trino.plugin.hive.authentication.NoHdfsAuthentication) BigDecimal(java.math.BigDecimal) CheckpointSchemaManager(io.trino.plugin.deltalake.transactionlog.checkpoint.CheckpointSchemaManager) Map(java.util.Map) Sets.union(com.google.common.collect.Sets.union) Path(org.apache.hadoop.fs.Path) HiveHdfsConfiguration(io.trino.plugin.hive.HiveHdfsConfiguration) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) LAST_CHECKPOINT_FILENAME(io.trino.plugin.deltalake.transactionlog.TransactionLogParser.LAST_CHECKPOINT_FILENAME) Assert.assertFalse(org.testng.Assert.assertFalse) Assert.assertEquals(io.trino.testing.assertions.Assert.assertEquals) TRANSACTION_LOG_DIRECTORY(io.trino.plugin.deltalake.transactionlog.TransactionLogUtil.TRANSACTION_LOG_DIRECTORY) ImmutableSet(com.google.common.collect.ImmutableSet) FileFormatDataSourceStats(io.trino.plugin.hive.FileFormatDataSourceStats) HdfsEnvironment(io.trino.plugin.hive.HdfsEnvironment) ImmutableMap(com.google.common.collect.ImmutableMap) UTC_KEY(io.trino.spi.type.TimeZoneKey.UTC_KEY) SESSION(io.trino.testing.TestingConnectorSession.SESSION) MetadataEntry(io.trino.plugin.deltalake.transactionlog.MetadataEntry) Set(java.util.Set) Collectors(java.util.stream.Collectors) SchemaTableName(io.trino.spi.connector.SchemaTableName) String.format(java.lang.String.format) List(java.util.List) Stream(java.util.stream.Stream) HdfsConfig(io.trino.plugin.hive.HdfsConfig) LocalDate(java.time.LocalDate) 
HdfsConfigurationInitializer(io.trino.plugin.hive.HdfsConfigurationInitializer) Decimals(io.trino.spi.type.Decimals) UTC(java.time.ZoneOffset.UTC) Optional(java.util.Optional) ParquetReaderConfig(io.trino.plugin.hive.parquet.ParquetReaderConfig) REGULAR(io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR) DeltaLakeFileStatistics(io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics) DataProvider(org.testng.annotations.DataProvider) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) LocalDateTime(java.time.LocalDateTime) DateTimeEncoding(io.trino.spi.type.DateTimeEncoding) ImmutableList(com.google.common.collect.ImmutableList) Files(com.google.common.io.Files) IntegerType(io.trino.spi.type.IntegerType) TESTING_TYPE_MANAGER(io.trino.type.InternalTypeManager.TESTING_TYPE_MANAGER) IOException(java.io.IOException) HdfsConfiguration(io.trino.plugin.hive.HdfsConfiguration) TupleDomain(io.trino.spi.predicate.TupleDomain) TestingConnectorContext(io.trino.testing.TestingConnectorContext) File(java.io.File) Assertions.assertEqualsIgnoreOrder(io.airlift.testing.Assertions.assertEqualsIgnoreOrder) ProtocolEntry(io.trino.plugin.deltalake.transactionlog.ProtocolEntry) Assert.assertTrue(org.testng.Assert.assertTrue) CommitInfoEntry(io.trino.plugin.deltalake.transactionlog.CommitInfoEntry) TypeManager(io.trino.spi.type.TypeManager) ColumnMetadata(io.trino.spi.connector.ColumnMetadata) SchemaTableName(io.trino.spi.connector.SchemaTableName) AddFileEntry(io.trino.plugin.deltalake.transactionlog.AddFileEntry) TableSnapshot(io.trino.plugin.deltalake.transactionlog.TableSnapshot) File(java.io.File) Test(org.testng.annotations.Test)

Aggregations

AddFileEntry (io.trino.plugin.deltalake.transactionlog.AddFileEntry)33 Test (org.testng.annotations.Test)26 DeltaLakeFileStatistics (io.trino.plugin.deltalake.transactionlog.statistics.DeltaLakeFileStatistics)18 Path (org.apache.hadoop.fs.Path)9 MetadataEntry (io.trino.plugin.deltalake.transactionlog.MetadataEntry)6 ProtocolEntry (io.trino.plugin.deltalake.transactionlog.ProtocolEntry)6 RemoveFileEntry (io.trino.plugin.deltalake.transactionlog.RemoveFileEntry)6 TableSnapshot (io.trino.plugin.deltalake.transactionlog.TableSnapshot)6 SchemaTableName (io.trino.spi.connector.SchemaTableName)6 File (java.io.File)6 List (java.util.List)5 ImmutableList (com.google.common.collect.ImmutableList)4 Map (java.util.Map)4 Optional (java.util.Optional)4 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 ImmutableSet.toImmutableSet (com.google.common.collect.ImmutableSet.toImmutableSet)3 REGULAR (io.trino.plugin.deltalake.DeltaLakeColumnType.REGULAR)3 DeltaLakeTransactionLogEntry (io.trino.plugin.deltalake.transactionlog.DeltaLakeTransactionLogEntry)3 TransactionEntry (io.trino.plugin.deltalake.transactionlog.TransactionEntry)3 HdfsEnvironment (io.trino.plugin.hive.HdfsEnvironment)3