
Example 41 with HoodieInstant

Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

From class TestHoodieTimelineArchiver, method testArchiveCompletedRollbackAndClean.

@ParameterizedTest
@CsvSource({ "true,true", "true,false", "false,true", "false,false" })
public void testArchiveCompletedRollbackAndClean(boolean isEmpty, boolean enableMetadataTable) throws Exception {
    init();
    int minInstantsToKeep = 2;
    int maxInstantsToKeep = 10;
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .forTable("test-trip-table")
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .retainCommits(1)
            .archiveCommitsWith(minInstantsToKeep, maxInstantsToKeep)
            .build())
        .withFileSystemViewConfig(FileSystemViewStorageConfig.newBuilder()
            .withRemoteServerPort(timelineServicePort)
            .build())
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .enable(enableMetadataTable)
            .build())
        .build();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    int startInstant = 1;
    for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant++) {
        createCleanMetadata(startInstant + "", false, isEmpty || i % 2 == 0);
    }
    for (int i = 0; i < maxInstantsToKeep + 1; i++, startInstant += 2) {
        createCommitAndRollbackFile(startInstant + 1 + "", startInstant + "", false, isEmpty || i % 2 == 0);
    }
    if (enableMetadataTable) {
        // Simulate a compaction commit in metadata table timeline
        // so the archival in data table can happen
        createCompactionCommitInMetadataTable(hadoopConf, wrapperFs, basePath, Integer.toString(99));
    }
    HoodieTable table = HoodieSparkTable.create(cfg, context, metaClient);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(cfg, table);
    archiver.archiveIfRequired(context);
    Stream<HoodieInstant> currentInstants = metaClient.getActiveTimeline().reload().getInstants();
    Map<String, List<HoodieInstant>> actionInstantMap = currentInstants.collect(Collectors.groupingBy(HoodieInstant::getAction));
    assertTrue(actionInstantMap.containsKey("clean"), "Clean action key must be present");
    assertEquals(minInstantsToKeep, actionInstantMap.get("clean").size(), "Only minInstantsToKeep clean instants should remain");
    assertTrue(actionInstantMap.containsKey("rollback"), "Rollback action key must be present");
    assertEquals(minInstantsToKeep, actionInstantMap.get("rollback").size(), "Only minInstantsToKeep rollback instants should remain");
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieTimelineArchiver (org.apache.hudi.client.HoodieTimelineArchiver), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), List (java.util.List), ArrayList (java.util.ArrayList), CsvSource (org.junit.jupiter.params.provider.CsvSource), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)
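
For orientation, a minimal sketch of just the retention knobs this test sets (illustrative only: archivalCfg is a made-up name, basePath as in the test, everything else left at defaults). archiveCommitsWith(min, max) makes the archiver trim the active timeline back to min instants once it grows past max, which is what the assertions above verify.

HoodieWriteConfig archivalCfg = HoodieWriteConfig.newBuilder()
    .withPath(basePath)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        // cleaner keeps this many completed commits around
        .retainCommits(1)
        // archive once the active timeline exceeds 10 instants, trimming it back to 2
        .archiveCommitsWith(2, 10)
        .build())
    .build();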

Example 42 with HoodieInstant

Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

From class TestHoodieTimelineArchiver, method verifyArchival.

private void verifyArchival(List<HoodieInstant> expectedArchivedInstants, List<HoodieInstant> expectedActiveInstants, List<HoodieInstant> commitsAfterArchival) {
    Collections.sort(expectedActiveInstants, Comparator.comparing(HoodieInstant::getTimestamp));
    Collections.sort(commitsAfterArchival, Comparator.comparing(HoodieInstant::getTimestamp));
    assertEquals(expectedActiveInstants, commitsAfterArchival);
    expectedArchivedInstants.forEach(entry -> assertFalse(commitsAfterArchival.contains(entry)));
    HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient);
    List<HoodieInstant> actualArchivedInstants = archivedTimeline.getInstants().collect(Collectors.toList());
    Collections.sort(actualArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp));
    Collections.sort(expectedArchivedInstants, Comparator.comparing(HoodieInstant::getTimestamp));
    assertEquals(expectedArchivedInstants, actualArchivedInstants);
    HoodieTimeline timeline = metaClient.getActiveTimeline();
    expectedArchivedInstants.forEach(entry -> {
        // safety check: archived non-rollback instants must fall at or before the start of the active timeline
        if (!HoodieTimeline.ROLLBACK_ACTION.equals(entry.getAction())) {
            assertTrue(timeline.containsOrBeforeTimelineStarts(entry.getTimestamp()), "Archived commits should always be safe");
        }
    });
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), HoodieArchivedTimeline (org.apache.hudi.common.table.timeline.HoodieArchivedTimeline)
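
As a standalone usage sketch (same imports as listed above, plus java.util.Map, java.util.List and java.util.stream.Collectors): the HoodieArchivedTimeline that verifyArchival reads can also be inspected directly, for example to count archived instants per action the way Example 41 groups active ones.

HoodieArchivedTimeline archivedTimeline = new HoodieArchivedTimeline(metaClient);
Map<String, List<HoodieInstant>> archivedByAction = archivedTimeline.getInstants()
    .collect(Collectors.groupingBy(HoodieInstant::getAction));
archivedByAction.forEach((action, instants) ->
    System.out.println(action + ": " + instants.size() + " archived instants"));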

Example 43 with HoodieInstant

Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

From class SparkClientFunctionalTestHarness, method updateRecordsInMORTable.

protected void updateRecordsInMORTable(HoodieTableMetaClient metaClient, List<HoodieRecord> records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, boolean doExplicitCommit) throws IOException {
    HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient);
    Map<HoodieKey, HoodieRecord> recordsMap = new HashMap<>();
    for (HoodieRecord rec : records) {
        if (!recordsMap.containsKey(rec.getKey())) {
            recordsMap.put(rec.getKey(), rec);
        }
    }
    JavaRDD<WriteStatus> statusesRdd = client.upsert(jsc().parallelize(records, 1), commitTime);
    List<WriteStatus> statuses = statusesRdd.collect();
    // Verify there are no errors
    assertNoWriteErrors(statuses);
    if (doExplicitCommit) {
        client.commit(commitTime, statusesRdd);
    }
    assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath())));
    Option<HoodieInstant> deltaCommit = reloadedMetaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals(commitTime, deltaCommit.get().getTimestamp(), "Latest Delta commit should match specified time");
    Option<HoodieInstant> commit = reloadedMetaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(commit.isPresent());
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), Path (org.apache.hadoop.fs.Path), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HashMap (java.util.HashMap), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieKey (org.apache.hudi.common.model.HoodieKey), WriteStatus (org.apache.hudi.client.WriteStatus)
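
A condensed sketch of the write-then-verify pattern the helper follows, assuming an open SparkRDDWriteClient named client and the harness's jsc() as above: when the write config requires explicit commits, upsert alone leaves the delta commit inflight, and client.commit is what completes it on the timeline.

JavaRDD<WriteStatus> statuses = client.upsert(jsc().parallelize(records, 1), commitTime);
// surface per-record failures before committing anything
assertNoWriteErrors(statuses.collect());
// completes the inflight delta commit on the active timeline
client.commit(commitTime, statuses);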

Example 44 with HoodieInstant

Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

From class DeltaSync, method getCheckpointToResume.

/**
 * Process the previous commit metadata and any user-set checkpoint configs to determine the checkpoint to resume from.
 * @param commitTimelineOpt commit timeline of interest.
 * @return the checkpoint to resume from, if applicable.
 * @throws IOException if the commit metadata cannot be read from the timeline.
 */
private Option<String> getCheckpointToResume(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
    Option<String> resumeCheckpointStr = Option.empty();
    Option<HoodieInstant> lastCommit = commitTimelineOpt.get().lastInstant();
    if (lastCommit.isPresent()) {
        // if previous commit metadata did not have the checkpoint key, try traversing previous commits until we find one.
        Option<HoodieCommitMetadata> commitMetadataOption = getLatestCommitMetadataWithValidCheckpointInfo(commitTimelineOpt.get());
        if (commitMetadataOption.isPresent()) {
            HoodieCommitMetadata commitMetadata = commitMetadataOption.get();
            LOG.debug("Checkpoint reset from metadata: " + commitMetadata.getMetadata(CHECKPOINT_RESET_KEY));
            if (cfg.checkpoint != null
                    && (StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY))
                    || !cfg.checkpoint.equals(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY)))) {
                resumeCheckpointStr = Option.of(cfg.checkpoint);
            } else if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_KEY))) {
                // if the stored checkpoint is an empty string, this branch is skipped and resumeCheckpointStr stays Option.empty()
                resumeCheckpointStr = Option.of(commitMetadata.getMetadata(CHECKPOINT_KEY));
            } else if (HoodieTimeline.compareTimestamps(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, HoodieTimeline.LESSER_THAN, lastCommit.get().getTimestamp())) {
                throw new HoodieDeltaStreamerException("Unable to find previous checkpoint. Please double check if this table "
                        + "was indeed built via delta streamer. Last Commit: " + lastCommit
                        + ", Instants: " + commitTimelineOpt.get().getInstants().collect(Collectors.toList())
                        + ", CommitMetadata=" + commitMetadata.toJsonString());
            }
            // KAFKA_CHECKPOINT_TYPE is honored only for the first batch.
            if (!StringUtils.isNullOrEmpty(commitMetadata.getMetadata(CHECKPOINT_RESET_KEY))) {
                props.remove(KafkaOffsetGen.Config.KAFKA_CHECKPOINT_TYPE.key());
            }
        } else if (cfg.checkpoint != null) {
            // getLatestCommitMetadataWithValidCheckpointInfo(commitTimelineOpt.get()) will never return a commit metadata w/o any checkpoint key set.
            resumeCheckpointStr = Option.of(cfg.checkpoint);
        }
    }
    return resumeCheckpointStr;
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieDeltaStreamerException (org.apache.hudi.utilities.exception.HoodieDeltaStreamerException)
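
For orientation, a sketch of how checkpoint values are typically read back out of commit metadata (an assumption about what getLatestCommitMetadataWithValidCheckpointInfo does internally, not a quote of it; CHECKPOINT_KEY and CHECKPOINT_RESET_KEY are the same constants used above):

HoodieTimeline commitTimeline = commitTimelineOpt.get();
Option<HoodieInstant> lastInstant = commitTimeline.lastInstant();
if (lastInstant.isPresent()) {
    HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
            commitTimeline.getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
    // either value may be null or empty if the commit carried no checkpoint info
    String checkpoint = metadata.getMetadata(CHECKPOINT_KEY);
    String checkpointReset = metadata.getMetadata(CHECKPOINT_RESET_KEY);
}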

Example 45 with HoodieInstant

Use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

From class HoodieClusteringJob, method doScheduleAndCluster.

private int doScheduleAndCluster(JavaSparkContext jsc) throws Exception {
    LOG.info("Step 1: Do schedule");
    String schemaStr = getSchemaFromLatestInstant();
    try (SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
        Option<String> instantTime = Option.empty();
        if (cfg.retryLastFailedClusteringJob) {
            HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(client.getConfig(), client.getEngineContext());
            HoodieTimeline inflightHoodieTimeline = table.getActiveTimeline().filterPendingReplaceTimeline().filterInflights();
            if (!inflightHoodieTimeline.empty()) {
                HoodieInstant inflightClusteringInstant = inflightHoodieTimeline.lastInstant().get();
                Date clusteringStartTime = HoodieActiveTimeline.parseDateFromInstantTime(inflightClusteringInstant.getTimestamp());
                if (clusteringStartTime.getTime() + cfg.maxProcessingTimeMs < System.currentTimeMillis()) {
                    // if a previous clustering attempt failed, reuse its instant time to trigger the
                    // next clustering action, which will roll back the failed attempt and re-run it.
                    LOG.info("Found failed clustering instant at: " + inflightClusteringInstant + "; will rollback the failed clustering and re-trigger again.");
                    instantTime = Option.of(inflightClusteringInstant.getTimestamp());
                } else {
                    LOG.info(inflightClusteringInstant + " might still be in progress; will trigger a new clustering job.");
                }
            }
        }
        instantTime = instantTime.isPresent() ? instantTime : doSchedule(client);
        if (!instantTime.isPresent()) {
            LOG.info("Couldn't generate cluster plan");
            return -1;
        }
        LOG.info("The schedule instant time is " + instantTime.get());
        LOG.info("Step 2: Do cluster");
        Option<HoodieCommitMetadata> metadata = client.cluster(instantTime.get(), true).getCommitMetadata();
        return UtilHelpers.handleErrors(metadata.get(), instantTime.get());
    }
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload), Date (java.util.Date)
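
A condensed sketch of the retry check in Step 1, reusing the timeline calls from the method above (parseDateFromInstantTime can throw a parse exception, so this assumes an enclosing method that declares Exception, as doScheduleAndCluster does):

HoodieTimeline pendingClustering = table.getActiveTimeline().filterPendingReplaceTimeline().filterInflights();
if (!pendingClustering.empty()) {
    HoodieInstant inflight = pendingClustering.lastInstant().get();
    long startedAtMs = HoodieActiveTimeline.parseDateFromInstantTime(inflight.getTimestamp()).getTime();
    if (startedAtMs + cfg.maxProcessingTimeMs < System.currentTimeMillis()) {
        // reusing a stale inflight instant time makes the next clustering run roll it back first
        instantTime = Option.of(inflight.getTimestamp());
    }
}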

Aggregations

HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 323 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 129 usages
ArrayList (java.util.ArrayList): 118 usages
List (java.util.List): 116 usages
IOException (java.io.IOException): 112 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 104 usages
Test (org.junit.jupiter.api.Test): 97 usages
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 96 usages
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 89 usages
Map (java.util.Map): 84 usages
Option (org.apache.hudi.common.util.Option): 84 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 84 usages
Collectors (java.util.stream.Collectors): 83 usages
HashMap (java.util.HashMap): 81 usages
Path (org.apache.hadoop.fs.Path): 78 usages
Pair (org.apache.hudi.common.util.collection.Pair): 71 usages
Logger (org.apache.log4j.Logger): 67 usages
LogManager (org.apache.log4j.LogManager): 66 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 65 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 61 usages