Example 11 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieMergeOnReadTable, the method testRollingStatsWithSmallFileHandling:

/**
 * Test to ensure that rolling stats are correctly written to the metadata file, and that small files are identified and corrected.
 */
@Test
public void testRollingStatsWithSmallFileHandling() throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        Map<String, Long> fileIdToInsertsMap = new HashMap<>();
        Map<String, Long> fileIdToUpsertsMap = new HashMap<>();
        String instantTime = "000";
        client.startCommitWithTime(instantTime);
        List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> statuses = client.insert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        HoodieTable table = HoodieSparkTable.create(cfg, context());
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
            table.getActiveTimeline().getInstantDetails(
                table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(),
            HoodieCommitMetadata.class);
        int inserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                inserts += stat.getNumInserts();
                fileIdToInsertsMap.put(stat.getFileId(), stat.getNumInserts());
                fileIdToUpsertsMap.put(stat.getFileId(), stat.getNumUpdateWrites());
            }
        }
        assertEquals(200, inserts);
        instantTime = "001";
        client.startCommitWithTime(instantTime);
        // Generate updates + inserts; the new inserts should be routed into existing small files
        records = dataGen.generateUpdates(instantTime, records);
        records.addAll(dataGen.generateInserts(instantTime, 200));
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        metadata = HoodieCommitMetadata.fromBytes(
            table.getActiveTimeline().getInstantDetails(
                table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(),
            HoodieCommitMetadata.class);
        inserts = 0;
        int upserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                assertTrue(fileIdToInsertsMap.containsKey(stat.getFileId()));
                assertTrue(fileIdToUpsertsMap.containsKey(stat.getFileId()));
                inserts += stat.getNumInserts();
                upserts += stat.getNumUpdateWrites();
            }
        }
        assertEquals(200, inserts);
        assertEquals(200, upserts);
        // Test small file handling after compaction
        instantTime = "002";
        client.scheduleCompactionAtInstant(instantTime, Option.of(metadata.getExtraMetadata()));
        HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instantTime);
        statuses = compactionMetadata.getWriteStatuses();
        client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        HoodieCommitMetadata metadata1 = HoodieCommitMetadata.fromBytes(
            table.getActiveTimeline().getInstantDetails(
                table.getActiveTimeline().getCommitsTimeline().lastInstant().get()).get(),
            HoodieCommitMetadata.class);
        // Ensure that the metadata stats from the extra metadata of delta commits are copied over to the compaction commit
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            assertTrue(metadata1.getPartitionToWriteStats().containsKey(pstat.getKey()));
            assertEquals(metadata1.getPartitionToWriteStats().get(pstat.getKey()).size(), pstat.getValue().size());
        }
        // Write inserts + updates
        instantTime = "003";
        client.startCommitWithTime(instantTime);
        // Generate updates + inserts; the new inserts should be routed into existing small files
        records = dataGen.generateUpdates(instantTime, records);
        records.addAll(dataGen.generateInserts(instantTime, 200));
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        metadata = HoodieCommitMetadata.fromBytes(
            table.getActiveTimeline().getInstantDetails(
                table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(),
            HoodieCommitMetadata.class);
        inserts = 0;
        upserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                assertTrue(fileIdToInsertsMap.containsKey(stat.getFileId()));
                inserts += stat.getNumInserts();
                upserts += stat.getNumUpdateWrites();
            }
        }
        assertEquals(200, inserts);
        assertEquals(400, upserts);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) List(java.util.List) Map(java.util.Map) HashMap(java.util.HashMap) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
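
The test above repeatedly reads the latest delta commit's bytes back into a HoodieCommitMetadata and sums the per-partition write stats. A minimal standalone sketch of that read-and-sum step, distilled from the test (the helper name and the long[] return shape are illustrative, not part of Hudi's API):

import java.io.IOException;
import java.util.List;
import java.util.Map;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

// Sums the inserts and updates recorded in the latest instant of the given
// timeline. Returns {inserts, updates}; returns {0, 0} on an empty timeline.
static long[] sumLatestCommitStats(HoodieTimeline timeline) throws IOException {
    long inserts = 0;
    long updates = 0;
    if (timeline.lastInstant().isPresent()) {
        HoodieInstant latest = timeline.lastInstant().get();
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
            timeline.getInstantDetails(latest).get(), HoodieCommitMetadata.class);
        for (Map.Entry<String, List<HoodieWriteStat>> entry : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : entry.getValue()) {
                inserts += stat.getNumInserts();
                updates += stat.getNumUpdateWrites();
            }
        }
    }
    return new long[] {inserts, updates};
}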

Example 12 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class HoodieClientTestUtils, the method getLatestFileIDsToFullPath:

private static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline, List<HoodieInstant> commitsToReturn) throws IOException {
    HashMap<String, String> fileIdToFullPath = new HashMap<>();
    for (HoodieInstant commit : commitsToReturn) {
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get(), HoodieCommitMetadata.class);
        fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
    }
    return fileIdToFullPath;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HashMap(java.util.HashMap)
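
Because HashMap.putAll overwrites earlier entries, the map ends up holding only the newest full path per file id, provided commitsToReturn is ordered by instant time. A minimal sketch of producing such an ordered list (in the Hudi versions these examples target, getInstants() returns a Stream; treat that as an assumption):

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

// Completed instants come off the timeline ordered by instant time, which is
// exactly the ordering getLatestFileIDsToFullPath relies on.
static List<HoodieInstant> completedCommitsInOrder(HoodieTimeline commitTimeline) {
    return commitTimeline.filterCompletedInstants().getInstants().collect(Collectors.toList());
}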

Example 13 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestCleanPlanExecutor, the method testKeepXHoursWithCleaning:

/**
 * Tests cleaning service based on number of hours retained.
 */
@ParameterizedTest
@MethodSource("argumentsForTestKeepLatestCommits")
public void testKeepXHoursWithCleaning(boolean simulateFailureRetry, boolean enableIncrementalClean, boolean enableBootstrapSourceClean) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder().withAssumeDatePartitioning(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withIncrementalCleaningMode(enableIncrementalClean)
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
            .withCleanBootstrapBaseFileEnabled(enableBootstrapSourceClean)
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_BY_HOURS)
            .cleanerNumHoursRetained(2)
            .build())
        .build();
    HoodieTestTable testTable = HoodieTestTable.of(metaClient);
    String p0 = "2020/01/01";
    String p1 = "2020/01/02";
    Map<String, List<BootstrapFileMapping>> bootstrapMapping = enableBootstrapSourceClean ? generateBootstrapIndexAndSourceData(p0, p1) : null;
    String file1P0C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p0).get(0).getFileId() : UUID.randomUUID().toString();
    String file1P1C0 = enableBootstrapSourceClean ? bootstrapMapping.get(p1).get(0).getFileId() : UUID.randomUUID().toString();
    Instant instant = Instant.now();
    ZonedDateTime commitDateTime = ZonedDateTime.ofInstant(instant, ZoneId.systemDefault());
    int minutesForFirstCommit = 150;
    String firstCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForFirstCommit).toInstant()));
    testTable.addInflightCommit(firstCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    HoodieCommitMetadata commitMetadata = generateCommitMetadata(firstCommitTs,
        Collections.unmodifiableMap(new HashMap<String, List<String>>() {
            {
                put(p0, CollectionUtils.createImmutableList(file1P0C0));
                put(p1, CollectionUtils.createImmutableList(file1P1C0));
            }
        }));
    metaClient.getActiveTimeline().saveAsComplete(
        new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, firstCommitTs),
        Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config, simulateFailureRetry);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, firstCommitTs, file1P0C0));
    assertTrue(testTable.baseFileExists(p1, firstCommitTs, file1P1C0));
    // Make the next commit, with 1 insert and 1 update per partition
    int minutesForSecondCommit = 90;
    String secondCommitTs = HoodieActiveTimeline.formatDate(Date.from(commitDateTime.minusMinutes(minutesForSecondCommit).toInstant()));
    Map<String, String> partitionAndFileId002 = testTable.addInflightCommit(secondCommitTs).getFileIdsWithBaseFilesInPartitions(p0, p1);
    String file2P0C1 = partitionAndFileId002.get(p0);
    String file2P1C1 = partitionAndFileId002.get(p1);
    testTable.forCommit(secondCommitTs).withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    commitMetadata = generateCommitMetadata(secondCommitTs, new HashMap<String, List<String>>() {
        {
            put(p0, CollectionUtils.createImmutableList(file1P0C0, file2P0C1));
            put(p1, CollectionUtils.createImmutableList(file1P1C0, file2P1C1));
        }
    });
    metaClient.getActiveTimeline().saveAsComplete(
        new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, secondCommitTs),
        Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config, simulateFailureRetry);
    assertEquals(2, hoodieCleanStatsTwo.size(), "Should clean one file each from both the partitions");
    assertTrue(testTable.baseFileExists(p0, secondCommitTs, file2P0C1));
    assertTrue(testTable.baseFileExists(p1, secondCommitTs, file2P1C1));
    assertTrue(testTable.baseFileExists(p0, secondCommitTs, file1P0C0));
    assertTrue(testTable.baseFileExists(p1, secondCommitTs, file1P1C0));
    assertFalse(testTable.baseFileExists(p0, firstCommitTs, file1P0C0));
    assertFalse(testTable.baseFileExists(p1, firstCommitTs, file1P1C0));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HashMap(java.util.HashMap) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Instant(java.time.Instant) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ZonedDateTime(java.time.ZonedDateTime) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) List(java.util.List) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
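
generateCommitMetadata above is a helper local to the test class and is not shown here. As a rough sketch of what assembling such metadata involves (the field choices are illustrative, not the helper's actual contents), commit metadata is built from one HoodieWriteStat per written file, keyed by partition:

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.common.model.WriteOperationType;

// Builds bare-bones commit metadata for a single base file in one partition.
static HoodieCommitMetadata buildCommitMetadata(String partition, String fileId, String path) {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    metadata.setOperationType(WriteOperationType.INSERT);
    HoodieWriteStat stat = new HoodieWriteStat();
    stat.setPartitionPath(partition);
    stat.setFileId(fileId);
    stat.setPath(path);
    metadata.addWriteStat(partition, stat);
    return metadata;
}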

Example 14 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieTimelineArchiver, the method testConvertCommitMetadata:

@Test
public void testConvertCommitMetadata() throws Exception {
    init();
    HoodieCommitMetadata hoodieCommitMetadata = new HoodieCommitMetadata();
    hoodieCommitMetadata.setOperationType(WriteOperationType.INSERT);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = MetadataConversionUtils.convertCommitMetadata(hoodieCommitMetadata);
    assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString());
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
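
The JSON side of the same conversion story: HoodieCommitMetadata is persisted on the timeline as JSON (see the saveAsComplete call in Example 13), and it round-trips through toJsonString/fromJsonString. A minimal sketch:

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.model.WriteOperationType;

static void jsonRoundTrip() throws Exception {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    metadata.setOperationType(WriteOperationType.UPSERT);
    // toJsonString produces the payload that saveAsComplete writes to the timeline file.
    String json = metadata.toJsonString();
    HoodieCommitMetadata roundTripped = HoodieCommitMetadata.fromJsonString(json, HoodieCommitMetadata.class);
    assert roundTripped.getOperationType() == WriteOperationType.UPSERT;
}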

Example 15 with HoodieCommitMetadata

Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

From the class TestHoodieTimelineArchiver, the method testNoArchivalWithInflightCompactionInMiddle:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testNoArchivalWithInflightCompactionInMiddle(boolean enableMetadata) throws Exception {
    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(enableMetadata, 2, 4, 2, 2, HoodieTableType.MERGE_ON_READ);
    // When max commits before archival is set to 4, archival should not kick in even after 7 commits if there is an inflight compaction in the middle.
    HoodieCommitMetadata inflightCompactionMetadata = null;
    for (int i = 1; i < 8; i++) {
        if (i == 2) {
            inflightCompactionMetadata = testTable.doCompaction("0000000" + i, Arrays.asList("p1", "p2"), true);
        } else {
            testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        }
        // archival
        Pair<List<HoodieInstant>, List<HoodieInstant>> commitsList = archiveAndGetCommitsList(writeConfig);
        List<HoodieInstant> originalCommits = commitsList.getKey();
        List<HoodieInstant> commitsAfterArchival = commitsList.getValue();
        if (enableMetadata) {
            assertEquals(originalCommits, commitsAfterArchival);
        } else {
            if (i != 6) {
                assertEquals(originalCommits, commitsAfterArchival);
            } else {
                // On the 7th commit archival kicks in, but it archives only one commit since the 2nd commit (the compaction) is still inflight.
                assertEquals(originalCommits.size() - commitsAfterArchival.size(), 1);
                for (int j = 1; j <= 6; j++) {
                    if (j == 1) {
                        // first commit should be archived
                        assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)));
                    } else if (j == 2) {
                        // 2nd compaction should not be archived
                        assertFalse(commitsAfterArchival.contains(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, "0000000" + j)));
                    } else {
                        // every other commit should not be archived
                        assertTrue(commitsAfterArchival.contains(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "0000000" + j)));
                    }
                }
            }
        }
    }
    // Move the inflight compaction to complete and add one regular write commit; archival should then archive more commits.
    // One extra commit is required because a compaction in the data table will not trigger table services in the metadata table.
    // Before this move, the timeline is: 2_inflight_compaction, 3, 4, 5, 6, 7.
    // After this move: 6, 7, 8 (2, 3, 4, 5 will be archived).
    testTable.moveInflightCompactionToComplete("00000002", inflightCompactionMetadata);
    testTable.doWriteOperation("00000008", WriteOperationType.UPSERT, Arrays.asList("p1", "p2"), 2);
    Pair<List<HoodieInstant>, List<HoodieInstant>> commitsList = archiveAndGetCommitsList(writeConfig);
    List<HoodieInstant> commitsAfterArchival = commitsList.getValue();
    List<HoodieInstant> archivedInstants = getAllArchivedCommitInstants(Arrays.asList("00000001", "00000003", "00000004", "00000005", "00000006"), HoodieTimeline.DELTA_COMMIT_ACTION);
    archivedInstants.add(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "00000002"));
    archivedInstants.add(new HoodieInstant(State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "00000002"));
    verifyArchival(archivedInstants, getActiveCommitInstants(Arrays.asList("00000007", "00000008"), HoodieTimeline.DELTA_COMMIT_ACTION), commitsAfterArchival);
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) ArrayList(java.util.ArrayList) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
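
The invariant this test exercises is that archival never moves past the earliest pending compaction. A minimal sketch of locating that boundary instant on a timeline (this is the idea being verified, not the archiver's actual implementation):

import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.Option;

// Archival must stop before this instant: the earliest compaction that is
// still requested or inflight on the timeline.
static Option<HoodieInstant> archivalBoundary(HoodieTimeline timeline) {
    return timeline.filterPendingCompactionTimeline().firstInstant();
}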

Aggregations

HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 139 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 64 usages
ArrayList (java.util.ArrayList): 54 usages
HashMap (java.util.HashMap): 49 usages
List (java.util.List): 48 usages
HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 44 usages
IOException (java.io.IOException): 42 usages
Test (org.junit.jupiter.api.Test): 41 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 40 usages
Map (java.util.Map): 38 usages
Path (org.apache.hadoop.fs.Path): 36 usages
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 34 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 34 usages
File (java.io.File): 26 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26 usages
Option (org.apache.hudi.common.util.Option): 25 usages
Schema (org.apache.avro.Schema): 22 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 21 usages
Collectors (java.util.stream.Collectors): 20 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20 usages