
Example 86 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestCleaner, method insertFirstBigBatchForClientCleanerTest.

/**
 * Helper method to do the first batch of inserts for the clean-by-versions/commits tests.
 *
 * @param cfg Hoodie write config
 * @param client Hoodie write client
 * @param recordGenFunction Function to generate records for insertion
 * @param insertFn Insertion API for testing
 * @param cleaningPolicy Cleaning policy exercised by the test
 * @return Pair of the new commit time and the resulting write statuses
 * @throws Exception in case of error
 */
private Pair<String, JavaRDD<WriteStatus>> insertFirstBigBatchForClientCleanerTest(
        HoodieWriteConfig cfg,
        SparkRDDWriteClient client,
        Function2<List<HoodieRecord>, String, Integer> recordGenFunction,
        Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn,
        HoodieCleaningPolicy cleaningPolicy) throws Exception {
    /*
     * Do a big insert (this is basically the same as the insert part of upsert; it is repeated here so we can catch
     * breakages in insert() if the implementations diverge).
     */
    String newCommitTime = client.startCommit();
    List<HoodieRecord> records = recordGenFunction.apply(newCommitTime, BIG_BATCH_INSERT_SIZE);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 5);
    JavaRDD<WriteStatus> statuses = insertFn.apply(client, writeRecords, newCommitTime);
    // Verify there are no errors
    assertNoWriteErrors(statuses.collect());
    // verify that there is a commit
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
    // All inserted records should now be in the table (check using the index), with locations marked at this commit
    HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
    if (client.getConfig().shouldAutoCommit()) {
        assertFalse(table.getCompletedCommitsTimeline().empty());
    }
    // We no longer write empty cleaner plans when there is nothing to be cleaned.
    assertTrue(table.getCompletedCleanTimeline().empty());
    if (client.getConfig().shouldAutoCommit()) {
        HoodieIndex index = SparkHoodieIndexFactory.createIndex(cfg);
        List<HoodieRecord> taggedRecords = tagLocation(index, jsc.parallelize(records, 1), table).collect();
        checkTaggedRecords(taggedRecords, newCommitTime);
    }
    return Pair.of(newCommitTime, statuses);
}
Also used : HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) WriteStatus(org.apache.hudi.client.WriteStatus)
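
A hypothetical call site for this helper might look like the sketch below. The lambda shapes assume Function2/Function3 here are single-method interfaces that accept Java lambdas, and dataGen, HoodieCleaningPolicy.KEEP_LATEST_COMMITS and the Pair accessor are assumptions borrowed from the surrounding Hudi test utilities rather than taken from this snippet.

// Hypothetical usage sketch (not from the Hudi sources): drive the helper with a record generator
// and the client's insert API, then reuse the returned commit time in later cleaner assertions.
Pair<String, JavaRDD<WriteStatus>> firstBatch = insertFirstBigBatchForClientCleanerTest(
        cfg,
        client,
        (commitTime, numRecords) -> dataGen.generateInserts(commitTime, numRecords),
        (writeClient, recordsRDD, commitTime) -> writeClient.insert(recordsRDD, commitTime),
        HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
String firstCommitTime = firstBatch.getLeft(); // assumed accessor, mirroring Pair.of(newCommitTime, statuses) above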

Example 87 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestUpsertPartitioner, method testAverageBytesPerRecordForNonEmptyCommitTimeLine.

@Test
public void testAverageBytesPerRecordForNonEmptyCommitTimeLine() throws Exception {
    HoodieTimeline commitTimeLine = mock(HoodieTimeline.class);
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1000).build()).build();
    when(commitTimeLine.empty()).thenReturn(false);
    when(commitTimeLine.getReverseOrderedInstants()).thenReturn(setupHoodieInstants().stream());
    LinkedList<Option<byte[]>> commits = generateCommitMetadataList();
    when(commitTimeLine.getInstantDetails(any(HoodieInstant.class))).thenAnswer(invocationOnMock -> commits.pop());
    long expectAvgSize = (long) Math.ceil((1.0 * 7500) / 1500);
    long actualAvgSize = averageBytesPerRecord(commitTimeLine, config);
    assertEquals(expectAvgSize, actualAvgSize);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Option(org.apache.hudi.common.util.Option) Test(org.junit.jupiter.api.Test)
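
The expected value in this test is plain byte-count arithmetic over the mocked commit metadata. A minimal standalone sketch of that calculation, assuming the generated metadata totals 7500 bytes across 1500 records (the test only exposes these totals through the expectAvgSize expression):

// Minimal sketch of the expected-value arithmetic; plain Java, no Hudi or Mockito required.
long totalBytesWritten = 7500L;   // assumed total bytes across the commits from generateCommitMetadataList()
long totalRecordsWritten = 1500L; // assumed total record count across those commits
long expectedAvgBytesPerRecord = (long) Math.ceil((1.0 * totalBytesWritten) / totalRecordsWritten);
System.out.println(expectedAvgBytesPerRecord); // 5, the value averageBytesPerRecord(...) must return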

Example 88 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestHoodieMergeHandle, method testUpsertsForMultipleRecordsInSameFile.

@ParameterizedTest
@MethodSource("testArguments")
public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
    // Create records in a single partition
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
    dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
    // Build common properties carrying the disk map type and compression settings under test
    Properties properties = new Properties();
    properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
    properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
    // Build a write config with bulk insert parallelism set
    HoodieWriteConfig cfg = getConfigBuilder().withProperties(properties).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
        /*
         * Write 1 (only inserts): bulk insert 44 records, of which two record keys are each repeated 21 times:
         * id1 (21 records), id2 (21 records), id3, id4.
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 4);
        HoodieRecord record1 = records.get(0);
        HoodieRecord record2 = records.get(1);
        for (int i = 0; i < 20; i++) {
            HoodieRecord dup = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
            records.add(dup);
        }
        for (int i = 0; i < 20; i++) {
            HoodieRecord dup = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
            records.add(dup);
        }
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        List<WriteStatus> statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there is a commit
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 001");
        assertEquals(records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain 44 records");
        /*
         * Write 2 (insert): bulk insert 1 record with the same row_key as record1 from the previous insert (id1).
         * At this point, we will have 2 files with the row_keys as shown here:
         *   File 1 - id1 (21 records), id2 (21 records), id3, id4
         *   File 2 - id1
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        // Do 1 more bulk insert with the same dup record1
        List<HoodieRecord> newRecords = new ArrayList<>();
        HoodieRecord sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
        newRecords.add(sameAsRecord1);
        writeRecords = jsc.parallelize(newRecords, 1);
        statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there are 2 commits
        metaClient = HoodieTableMetaClient.reload(metaClient);
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(2, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting two commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 002");
        Dataset<Row> dataSet = getRecords();
        assertEquals(45, dataSet.count(), "Must contain 45 records");
        /*
         * Write 3 (insert): bulk insert 2 completely new records. At this point, we will have 3 files with the
         * row_keys as shown here:
         *   File 1 - id1 (21 records), id2 (21 records), id3, id4
         *   File 2 - id1
         *   File 3 - id5, id6
         */
        newCommitTime = "003";
        client.startCommitWithTime(newCommitTime);
        newRecords = dataGen.generateInserts(newCommitTime, 2);
        writeRecords = jsc.parallelize(newRecords, 1);
        statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there are now 3 commits
        metaClient = HoodieTableMetaClient.reload(metaClient);
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(3, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting three commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 003");
        dataSet = getRecords();
        assertEquals(47, dataSet.count(), "Must contain 47 records");
        /*
         * Write 4 (updates): generate 2 upsert records with id1 and id2. The rider and driver names in the update
         * records will be rider-004 and driver-004. After the upsert completes, all records with id1 in File 1 and
         * File 2 must be updated, and all records with id2 in File 1 must also be updated. None of the other records
         * in File 1, File 2 and File 3 may be updated.
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> updateRecords = new ArrayList<>();
        // This exists in 001 and 002 and should be updated in both
        sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
        updateRecords.add(sameAsRecord1);
        // This exists in 001 and should be updated
        HoodieRecord sameAsRecord2 = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
        updateRecords.add(sameAsRecord2);
        JavaRDD<HoodieRecord> updateRecordsRDD = jsc.parallelize(updateRecords, 1);
        statuses = client.upsert(updateRecordsRDD, newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        // verify there are now 4 commits
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(4, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting four commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 004");
        // Check the entire dataset has 47 records still
        dataSet = getRecords();
        assertEquals(47, dataSet.count(), "Must contain 47 records");
        Row[] rows = (Row[]) dataSet.collect();
        int record1Count = 0;
        int record2Count = 0;
        for (Row row : rows) {
            if (row.getAs("_hoodie_record_key").equals(record1.getKey().getRecordKey())) {
                record1Count++;
                // assert each duplicate record is updated
                assertEquals(row.getAs("rider"), "rider-004");
                assertEquals(row.getAs("driver"), "driver-004");
            } else if (row.getAs("_hoodie_record_key").equals(record2.getKey().getRecordKey())) {
                record2Count++;
                // assert each duplicate record is updated
                assertEquals(row.getAs("rider"), "rider-004");
                assertEquals(row.getAs("driver"), "driver-004");
            } else {
                assertNotEquals(row.getAs("rider"), "rider-004");
                assertNotEquals(row.getAs("driver"), "rider-004");
            }
        }
        // Assert that 22 records (the total number of rows with row_key id1) have been updated to rider-004 and
        // driver-004
        assertEquals(22, record1Count);
        // Assert that 21 records (the total number of rows with row_key id2) have been updated to rider-004 and
        // driver-004
        assertEquals(21, record2Count);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
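
The final 22/21 assertions follow directly from how the duplicates were built up across the writes. A small, purely illustrative sketch of that bookkeeping:

// Illustrative arithmetic behind record1Count and record2Count.
int id1Rows = 1 + 20 + 1; // write 001: one insert plus 20 duplicates; write 002: one more bulk insert of id1
int id2Rows = 1 + 20;     // write 001: one insert plus 20 duplicates
System.out.println(id1Rows + " " + id2Rows); // 22 21; every one of these rows must carry rider-004/driver-004 after write 004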

Example 89 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestHoodieTimelineArchiver, method testArchiveCommitSavepointNoHole.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testArchiveCommitSavepointNoHole(boolean enableMetadataTable) throws Exception {
    init();
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .forTable("test-trip-table")
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build())
        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder().withRemoteServerPort(timelineServicePort).build())
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(enableMetadataTable).build())
        .build();
    HoodieTestDataGenerator.createCommitFile(basePath, "100", wrapperFs.getConf());
    HoodieTestDataGenerator.createCommitFile(basePath, "101", wrapperFs.getConf());
    HoodieTestDataGenerator.createSavepointFile(basePath, "101", wrapperFs.getConf());
    HoodieTestDataGenerator.createCommitFile(basePath, "102", wrapperFs.getConf());
    HoodieTestDataGenerator.createCommitFile(basePath, "103", wrapperFs.getConf());
    HoodieTestDataGenerator.createCommitFile(basePath, "104", wrapperFs.getConf());
    HoodieTestDataGenerator.createCommitFile(basePath, "105", wrapperFs.getConf());
    HoodieTable table = HoodieSparkTable.create(cfg, context);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table);
    if (enableMetadataTable) {
        // Simulate a compaction commit in metadata table timeline
        // so the archival in data table can happen
        createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105");
    }
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match");
    assertTrue(archiver.archiveIfRequired(context));
    timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
    assertEquals(5, timeline.countInstants(), "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)");
    assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")), "Archived commits should always be safe");
    assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")), "Archived commits should always be safe");
    assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")), "Archived commits should always be safe");
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
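
The assertion messages encode the savepoint rule directly: nothing at or after the earliest savepoint may be archived. A conceptual sketch of that boundary check, illustrative only and not the archiver's actual implementation:

// Conceptual sketch: with commits "100".."105" and a savepoint at "101", only instants strictly
// before the earliest savepoint are archival candidates, so just "100" is archived and five
// commits remain active, matching assertEquals(5, timeline.countInstants(), ...) above.
String earliestSavepoint = "101";
String[] commits = { "100", "101", "102", "103", "104", "105" };
int archivable = 0;
for (String instantTime : commits) {
    if (instantTime.compareTo(earliestSavepoint) < 0) { // instant times compare lexicographically
        archivable++;
    }
}
System.out.println(archivable); // 1 -> only "100" can be archived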

Example 90 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestHoodieTimelineArchiver, method testArchiveCommitsWithCompactionCommitInMetadataTableTimeline.

@Test
public void testArchiveCommitsWithCompactionCommitInMetadataTableTimeline() throws Exception {
    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 4, 20);
    int startInstantTime = 100;
    int numCommits = 15;
    // "100" till "105" should be archived in this case
    int numExpectedArchived = 6;
    for (int i = startInstantTime; i < startInstantTime + numCommits; i++) {
        HoodieTestDataGenerator.createCommitFile(basePath, Integer.toString(i), wrapperFs.getConf());
    }
    // Simulate a compaction commit in metadata table timeline
    // so the archival in data table can happen
    createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, "105");
    HoodieTable table = HoodieSparkTable.create(writeConfig, context);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
    HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    assertEquals(numCommits, timeline.countInstants(), String.format("Loaded %d commits and the count should match", numCommits));
    assertTrue(archiver.archiveIfRequired(context));
    timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
    assertEquals(numCommits - numExpectedArchived, timeline.countInstants(), "Since we have a compaction commit of 105 in metadata table timeline, we should never archive any commit after that");
    for (int i = startInstantTime + numExpectedArchived; i < startInstantTime + numCommits; i++) {
        assertTrue(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, Integer.toString(i))), String.format("Commit %d should not be archived", i));
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
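
Here the boundary is the latest compaction instant on the metadata table timeline rather than a savepoint. A similar conceptual sketch of the expected split, again illustrative only:

// Conceptual sketch: data table commits "100".."114" with the metadata table compaction at "105".
// Per the assertion above, nothing after "105" may be archived, so at most the 6 commits
// "100".."105" are archivable and "106".."114" stay on the active timeline.
String latestMetadataCompaction = "105";
int archivable = 0;
for (int i = 100; i < 115; i++) {
    if (Integer.toString(i).compareTo(latestMetadataCompaction) <= 0) {
        archivable++;
    }
}
System.out.println(archivable); // 6, matching numExpectedArchived in the test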

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 118
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 74
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 39
List (java.util.List) 36
IOException (java.io.IOException) 34
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 34
ArrayList (java.util.ArrayList) 32
Option (org.apache.hudi.common.util.Option) 30
Collectors (java.util.stream.Collectors) 29
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 29
HoodieException (org.apache.hudi.exception.HoodieException) 26
Map (java.util.Map) 25
FileStatus (org.apache.hadoop.fs.FileStatus) 24
Path (org.apache.hadoop.fs.Path) 24
Set (java.util.Set) 22
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 22
FileSlice (org.apache.hudi.common.model.FileSlice) 21
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 21
Pair (org.apache.hudi.common.util.collection.Pair) 21
FSUtils (org.apache.hudi.common.fs.FSUtils) 20