Search in sources :

Example 16 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class DeltaSync method getCheckpointToResume.

/**
 * Works out which checkpoint the next sync round should resume from, based on the checkpoint
 * information found in earlier commit metadata and on the checkpoint configured by the user
 * ({@code cfg.checkpoint}).
 *
 * <p>When commit metadata carrying checkpoint info is found, the result is picked in this order:
 * <ol>
 *   <li>{@code cfg.checkpoint}, if it is set and the value stored under {@code CHECKPOINT_RESET_KEY}
 *       is null/empty or different from it;</li>
 *   <li>otherwise the value stored under {@code CHECKPOINT_KEY}, if it is not null/empty;</li>
 *   <li>otherwise a {@link HoodieDeltaStreamerException} is thrown when
 *       {@code FULL_BOOTSTRAP_INSTANT_TS} compares as {@code LESSER_THAN} the last commit's
 *       timestamp; if it does not, the result stays empty.</li>
 * </ol>
 * When no such commit metadata is found, {@code cfg.checkpoint} is used if it is set.
 *
 * <p>Side effect: if the commit metadata has a non-empty {@code CHECKPOINT_RESET_KEY} value, the
 * {@code KAFKA_CHECKPOINT_TYPE} entry is removed from {@code props}.
 *
 * @param commitTimelineOpt commit timeline of interest; it is dereferenced unconditionally, so it
 *                          must hold a timeline.
 * @return the checkpoint to resume from, or an empty option if there is no last commit or no
 *         applicable checkpoint.
 * @throws IOException declared by this method; presumably raised while reading or serializing
 *                     commit metadata - confirm against the helpers used here.
 */
private Option<String> getCheckpointToResume(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
    HoodieTimeline commitTimeline = commitTimelineOpt.get();
    Option<HoodieInstant> lastCommit = commitTimeline.lastInstant();
    if (!lastCommit.isPresent()) {
        // Nothing has been committed yet, so there is nothing to resume from.
        return Option.empty();
    }

    // If the latest commit metadata has no checkpoint key, the helper is expected to walk back
    // through earlier commits until it finds one that does.
    Option<HoodieCommitMetadata> commitMetadataOption = getLatestCommitMetadataWithValidCheckpointInfo(commitTimeline);
    if (!commitMetadataOption.isPresent()) {
        // No commit with checkpoint info was found: only a user-supplied checkpoint can apply.
        if (cfg.checkpoint != null) {
            return Option.of(cfg.checkpoint);
        }
        return Option.empty();
    }

    HoodieCommitMetadata commitMetadata = commitMetadataOption.get();
    String storedResetCheckpoint = commitMetadata.getMetadata(CHECKPOINT_RESET_KEY);
    LOG.debug("Checkpoint reset from metadata: " + storedResetCheckpoint);
    boolean hasStoredResetCheckpoint = !StringUtils.isNullOrEmpty(storedResetCheckpoint);

    Option<String> resumeCheckpointStr = Option.empty();
    boolean userCheckpointOverrides = cfg.checkpoint != null
        && (!hasStoredResetCheckpoint || !cfg.checkpoint.equals(storedResetCheckpoint));
    if (userCheckpointOverrides) {
        // The configured checkpoint has not been recorded as a reset yet, so it takes precedence.
        resumeCheckpointStr = Option.of(cfg.checkpoint);
    } else {
        String storedCheckpoint = commitMetadata.getMetadata(CHECKPOINT_KEY);
        if (!StringUtils.isNullOrEmpty(storedCheckpoint)) {
            resumeCheckpointStr = Option.of(storedCheckpoint);
        } else if (HoodieTimeline.compareTimestamps(HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS, HoodieTimeline.LESSER_THAN, lastCommit.get().getTimestamp())) {
            // The previous checkpoint is null/empty although the last commit compares as later than
            // the full-bootstrap instant: the checkpoint is treated as missing.
            throw new HoodieDeltaStreamerException("Unable to find previous checkpoint. Please double check if this table " + "was indeed built via delta streamer. Last Commit :" + lastCommit + ", Instants :" + commitTimeline.getInstants().collect(Collectors.toList()) + ", CommitMetadata=" + commitMetadata.toJsonString());
        }
        // Otherwise the previous checkpoint is null/empty and the result stays Option.empty().
    }

    // KAFKA_CHECKPOINT_TYPE will be honored only for first batch.
    if (hasStoredResetCheckpoint) {
        props.remove(KafkaOffsetGen.Config.KAFKA_CHECKPOINT_TYPE.key());
    }
    return resumeCheckpointStr;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieDeltaStreamerException(org.apache.hudi.utilities.exception.HoodieDeltaStreamerException)

Example 17 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class HoodieClusteringJob method doScheduleAndCluster.

/**
 * Schedules a clustering plan (or picks up a stale inflight one) and then executes it.
 *
 * <p>When {@code cfg.retryLastFailedClusteringJob} is set and the last inflight pending-replace
 * instant started more than {@code cfg.maxProcessingTimeMs} ago, that instant's time is reused
 * instead of scheduling a new plan.
 *
 * @param jsc Spark context used to create the write client.
 * @return {@code -1} if no instant time could be obtained from scheduling; otherwise the value
 *         returned by {@code UtilHelpers.handleErrors} for the clustering commit metadata.
 * @throws Exception declared by this method; any failure from the client or the helpers propagates.
 */
private int doScheduleAndCluster(JavaSparkContext jsc) throws Exception {
    LOG.info("Step 1: Do schedule");
    String schemaStr = getSchemaFromLatestInstant();
    try (SparkRDDWriteClient<HoodieRecordPayload> client = UtilHelpers.createHoodieClient(jsc, cfg.basePath, schemaStr, cfg.parallelism, Option.empty(), props)) {
        Option<String> instantTime = Option.empty();

        if (cfg.retryLastFailedClusteringJob) {
            HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(client.getConfig(), client.getEngineContext());
            HoodieTimeline inflightHoodieTimeline = table.getActiveTimeline().filterPendingReplaceTimeline().filterInflights();
            if (!inflightHoodieTimeline.empty()) {
                HoodieInstant inflightClusteringInstant = inflightHoodieTimeline.lastInstant().get();
                long clusteringStartMs = HoodieActiveTimeline.parseDateFromInstantTime(inflightClusteringInstant.getTimestamp()).getTime();
                boolean exceededMaxProcessingTime = clusteringStartMs + cfg.maxProcessingTimeMs < System.currentTimeMillis();
                if (exceededMaxProcessingTime) {
                    // The inflight clustering is considered failed: reuse its instant time so the
                    // next clustering action rolls it back and runs it again.
                    LOG.info("Found failed clustering instant at : " + inflightClusteringInstant + "; Will rollback the failed clustering and re-trigger again.");
                    instantTime = Option.of(inflightClusteringInstant.getTimestamp());
                } else {
                    LOG.info(inflightClusteringInstant + " might still be in progress, will trigger a new clustering job.");
                }
            }
        }

        // No stale instant to retry: schedule a fresh clustering plan.
        if (!instantTime.isPresent()) {
            instantTime = doSchedule(client);
        }
        if (!instantTime.isPresent()) {
            LOG.info("Couldn't generate cluster plan");
            return -1;
        }

        String clusteringInstantTime = instantTime.get();
        LOG.info("The schedule instant time is " + clusteringInstantTime);
        LOG.info("Step 2: Do cluster");
        Option<HoodieCommitMetadata> metadata = client.cluster(clusteringInstantTime, true).getCommitMetadata();
        return UtilHelpers.handleErrors(metadata.get(), clusteringInstantTime);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Date(java.util.Date)

Example 18 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class HoodieTestCommitMetadataGenerator method generateCommitMetadata.

/**
 * Builds a {@link HoodieCommitMetadata} holding the given extra metadata plus one write stat for
 * every file path listed under every partition.
 *
 * <p>Each write stat is filled with the {@code DEFAULT_*} constants, except for the partition path
 * (taken from the map key) and the write / update-write counts, which come from {@code writes} and
 * {@code updates} when present.
 *
 * @param partitionToFilePaths partition path mapped to the file paths in that partition; only the
 *                             number of paths per partition affects the result.
 * @param writes               number of writes to record, or empty to use {@code DEFAULT_NUM_WRITES}.
 * @param updates              number of update writes to record, or empty to use
 *                             {@code DEFAULT_NUM_UPDATE_WRITES}.
 * @param extraMetadata        key/value pairs copied into the commit metadata.
 * @return the populated commit metadata.
 */
private static HoodieCommitMetadata generateCommitMetadata(Map<String, List<String>> partitionToFilePaths, Option<Integer> writes, Option<Integer> updates, Map<String, String> extraMetadata) {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();

    for (Map.Entry<String, String> extra : extraMetadata.entrySet()) {
        metadata.addMetadata(extra.getKey(), extra.getValue());
    }

    for (Map.Entry<String, List<String>> partitionEntry : partitionToFilePaths.entrySet()) {
        String partitionPath = partitionEntry.getKey();
        // NOTE(review): the individual file path is not used; every stat gets DEFAULT_PATH and
        // DEFAULT_FILEID. Presumably intentional for test data - confirm.
        for (String filePath : partitionEntry.getValue()) {
            HoodieWriteStat writeStat = new HoodieWriteStat();
            writeStat.setPartitionPath(partitionPath);
            writeStat.setPath(DEFAULT_PATH);
            writeStat.setFileId(DEFAULT_FILEID);
            writeStat.setTotalWriteBytes(DEFAULT_TOTAL_WRITE_BYTES);
            writeStat.setPrevCommit(DEFAULT_PRE_COMMIT);
            writeStat.setNumWrites(writes.orElse(DEFAULT_NUM_WRITES));
            writeStat.setNumUpdateWrites(updates.orElse(DEFAULT_NUM_UPDATE_WRITES));
            writeStat.setTotalLogBlocks(DEFAULT_TOTAL_LOG_BLOCKS);
            writeStat.setTotalLogRecords(DEFAULT_TOTAL_LOG_RECORDS);
            metadata.addWriteStat(partitionPath, writeStat);
        }
    }
    return metadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Arrays(java.util.Arrays) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) HashMap(java.util.HashMap) UUID(java.util.UUID) FileCreateUtils.baseFileName(org.apache.hudi.common.testutils.FileCreateUtils.baseFileName) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) StandardCharsets(java.nio.charset.StandardCharsets) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) CollectionUtils.createImmutableList(org.apache.hudi.common.util.CollectionUtils.createImmutableList) Path(org.apache.hadoop.fs.Path) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HashMap(java.util.HashMap) Map(java.util.Map)

Example 19 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class TransactionUtils method resolveWriteConflictIfAny.

/**
 * Checks the current transaction against candidate instants and resolves any conflicts found,
 * when the configured write concurrency mode supports optimistic concurrency control.
 *
 * <p>If optimistic concurrency control is not supported, nothing is checked and
 * {@code thisCommitMetadata} is returned unchanged.
 *
 * @param table                        table whose timeline and meta client are consulted.
 * @param currentTxnOwnerInstant       instant owning the current transaction; it is dereferenced
 *                                     unconditionally on the optimistic-concurrency path.
 * @param thisCommitMetadata           commit metadata of the current write; an empty
 *                                     {@link HoodieCommitMetadata} is substituted when absent.
 * @param config                       write config supplying the concurrency mode and the conflict
 *                                     resolution strategy.
 * @param lastCompletedTxnOwnerInstant last completed transaction owner instant, passed to the
 *                                     strategy when it selects candidate instants.
 * @param reloadActiveTimeline         when {@code true} the active timeline is reloaded through the
 *                                     meta client; otherwise the table's current active timeline
 *                                     is used.
 * @return the commit metadata option held by the current operation after resolution, or
 *         {@code thisCommitMetadata} when optimistic concurrency control is not supported.
 * @throws HoodieWriteConflictException if an {@link IOException} occurs while building the other
 *                                      operation or while checking/resolving a conflict;
 *                                      presumably also raised by the strategy for conflicts it
 *                                      cannot resolve - confirm against the strategy in use.
 */
public static Option<HoodieCommitMetadata> resolveWriteConflictIfAny(final HoodieTable table, final Option<HoodieInstant> currentTxnOwnerInstant, final Option<HoodieCommitMetadata> thisCommitMetadata, final HoodieWriteConfig config, Option<HoodieInstant> lastCompletedTxnOwnerInstant, boolean reloadActiveTimeline) throws HoodieWriteConflictException {
    if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl()) {
        // No optimistic concurrency control: there is nothing to resolve.
        return thisCommitMetadata;
    }

    ConflictResolutionStrategy strategy = config.getWriteConflictResolutionStrategy();
    Stream<HoodieInstant> candidateInstants = strategy.getCandidateInstants(
        reloadActiveTimeline ? table.getMetaClient().reloadActiveTimeline() : table.getActiveTimeline(),
        currentTxnOwnerInstant.get(),
        lastCompletedTxnOwnerInstant);
    final ConcurrentOperation currentOperation = new ConcurrentOperation(currentTxnOwnerInstant.get(), thisCommitMetadata.orElse(new HoodieCommitMetadata()));

    candidateInstants.forEach(candidate -> {
        try {
            ConcurrentOperation candidateOperation = new ConcurrentOperation(candidate, table.getMetaClient());
            if (strategy.hasConflict(currentOperation, candidateOperation)) {
                LOG.info("Conflict encountered between current instant = " + currentOperation + " and instant = " + candidateOperation + ", attempting to resolve it...");
                strategy.resolveConflict(table, currentOperation, candidateOperation);
            }
        } catch (IOException io) {
            // The lambda cannot throw checked exceptions, so wrap and keep the cause.
            throw new HoodieWriteConflictException("Unable to resolve conflict, if present", io);
        }
    });

    LOG.info("Successfully resolved conflicts, if any");
    return currentOperation.getCommitMetadataOption();
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) ConcurrentOperation(org.apache.hudi.client.transaction.ConcurrentOperation) ConflictResolutionStrategy(org.apache.hudi.client.transaction.ConflictResolutionStrategy) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieWriteConflictException(org.apache.hudi.exception.HoodieWriteConflictException)

Example 20 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class CommitUtils method buildMetadata.

/**
 * Builds commit metadata from write stats and then attaches extra metadata, the schema and the
 * operation type.
 *
 * @param writeStats                write stats forwarded to {@code buildMetadataFromStats}.
 * @param partitionToReplaceFileIds replaced file ids per partition, forwarded to
 *                                  {@code buildMetadataFromStats}.
 * @param extraMetadata             optional key/value pairs copied into the commit metadata.
 * @param operationType             write operation type recorded on the commit metadata.
 * @param schemaToStoreInCommit     schema stored under {@code HoodieCommitMetadata.SCHEMA_KEY};
 *                                  {@code null} or a value equal to {@code NULL_SCHEMA_STR} is
 *                                  stored as the empty string.
 * @param commitActionType          commit action type forwarded to {@code buildMetadataFromStats}.
 * @return the assembled commit metadata.
 */
public static HoodieCommitMetadata buildMetadata(List<HoodieWriteStat> writeStats, Map<String, List<String>> partitionToReplaceFileIds, Option<Map<String, String>> extraMetadata, WriteOperationType operationType, String schemaToStoreInCommit, String commitActionType) {
    HoodieCommitMetadata commitMetadata = buildMetadataFromStats(writeStats, partitionToReplaceFileIds, commitActionType, operationType);

    // Copy over any caller-supplied extra metadata.
    if (extraMetadata.isPresent()) {
        for (Map.Entry<String, String> extra : extraMetadata.get().entrySet()) {
            commitMetadata.addMetadata(extra.getKey(), extra.getValue());
        }
    }

    // A missing schema and the NULL_SCHEMA_STR placeholder are both stored as an empty string.
    String schemaToPersist;
    if (schemaToStoreInCommit == null || schemaToStoreInCommit.equals(NULL_SCHEMA_STR)) {
        schemaToPersist = "";
    } else {
        schemaToPersist = schemaToStoreInCommit;
    }
    commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schemaToPersist);

    commitMetadata.setOperationType(operationType);
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata)

Aggregations

HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)139 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)64 ArrayList (java.util.ArrayList)54 HashMap (java.util.HashMap)49 List (java.util.List)48 HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat)44 IOException (java.io.IOException)42 Test (org.junit.jupiter.api.Test)41 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)40 Map (java.util.Map)38 Path (org.apache.hadoop.fs.Path)36 HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)34 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)34 File (java.io.File)26 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)26 Option (org.apache.hudi.common.util.Option)25 Schema (org.apache.avro.Schema)22 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)21 Collectors (java.util.stream.Collectors)20 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)20