Search in sources:

Example 71 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TestHoodieDeltaStreamerWithMultiWriter, the method testUpsertsContinuousModeWithMultipleWritersForConflicts:

@ParameterizedTest
@EnumSource(HoodieTableType.class)
void testUpsertsContinuousModeWithMultipleWritersForConflicts(HoodieTableType tableType) throws Exception {
    // NOTE: Overriding the LockProvider to InProcessLockProvider, since ZooKeeper locks work in unit tests but fail on Jenkins with connection timeouts
    basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString();
    propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER;
    tableBasePath = basePath + "/testtable_" + tableType;
    prepareInitialConfigs(fs(), basePath, "foo");
    TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 3000;
    HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(
        tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath,
        Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    prepJobConfig.continuousMode = true;
    prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc());
    // Prepare base dataset with some commits
    deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> {
        if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs());
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs());
        } else {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs());
        }
        TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        return true;
    });
    HoodieDeltaStreamer.Config cfgIngestionJob = getDeltaStreamerConfig(
        tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath,
        Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    cfgIngestionJob.continuousMode = true;
    cfgIngestionJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgIngestionJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    // create a backfill job
    HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(
        tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath,
        Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    cfgBackfillJob.continuousMode = false;
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder()
        .setConf(hadoopConf()).setBasePath(tableBasePath).build();
    HoodieTimeline timeline = meta.reloadActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
        timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class);
    cfgBackfillJob.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY);
    cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc());
    // re-init ingestion job to start sync service
    HoodieDeltaStreamer ingestionJob2 = new HoodieDeltaStreamer(cfgIngestionJob, jsc());
    // run ingestion & backfill in parallel, create conflict and fail one
    runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob2, cfgIngestionJob, backfillJob, cfgBackfillJob, true, "batch1");
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
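A hedged, minimal sketch of the multi-writer configuration this test depends on (the property keys are standard Hudi configs, but the exact combination shown here is an assumption distilled from the test above, not its verbatim setup):

TypedProperties props = new TypedProperties();
// Multiple writers require optimistic concurrency control.
props.setProperty("hoodie.write.concurrency.mode", "optimistic_concurrency_control");
// Lazy cleaning lets a conflicting writer fail without clobbering live data.
props.setProperty("hoodie.cleaner.policy.failed.writes", "LAZY");
// In-process locking only works when both writers share one JVM, as in this test.
props.setProperty("hoodie.write.lock.provider",
    "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
// Give up on the lock after 3 seconds instead of hanging the test.
props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");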

Example 72 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class IncrSourceHelper, the method calculateBeginAndEndInstants:

/**
 * Find begin and end instants to be set for the next fetch.
 *
 * @param jssc                      Java Spark context
 * @param srcBasePath               Base path of the Hudi source table
 * @param numInstantsPerFetch       Max instants per fetch
 * @param beginInstant              Last checkpoint string
 * @param missingCheckpointStrategy Strategy for choosing a begin instant when the checkpoint is missing
 * @return Begin and end instants along with the query type.
 */
public static Pair<String, Pair<String, String>> calculateBeginAndEndInstants(JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch, Option<String> beginInstant, MissingCheckpointStrategy missingCheckpointStrategy) {
    ValidationUtils.checkArgument(numInstantsPerFetch > 0, "Make sure the config hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value");
    HoodieTableMetaClient srcMetaClient = HoodieTableMetaClient.builder()
        .setConf(jssc.hadoopConfiguration()).setBasePath(srcBasePath)
        .setLoadActiveTimelineOnLoad(true).build();
    final HoodieTimeline activeCommitTimeline = srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
    String beginInstantTime = beginInstant.orElseGet(() -> {
        if (missingCheckpointStrategy != null) {
            if (missingCheckpointStrategy == MissingCheckpointStrategy.READ_LATEST) {
                Option<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
                return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse(DEFAULT_BEGIN_TIMESTAMP);
            } else {
                return DEFAULT_BEGIN_TIMESTAMP;
            }
        } else {
            throw new IllegalArgumentException("Missing begin instant for incremental pull. For reading from latest " + "committed instant set hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy to a valid value");
        }
    });
    if (missingCheckpointStrategy == MissingCheckpointStrategy.READ_LATEST || !activeCommitTimeline.isBeforeTimelineStarts(beginInstantTime)) {
        Option<HoodieInstant> nthInstant = Option.fromJavaOptional(activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y));
        return Pair.of(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL(), Pair.of(beginInstantTime, nthInstant.map(HoodieInstant::getTimestamp).orElse(beginInstantTime)));
    } else {
        // when MissingCheckpointStrategy is set to read everything until latest, trigger snapshot query.
        Option<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
        return Pair.of(DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL(), Pair.of(beginInstantTime, lastInstant.get().getTimestamp()));
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Objects(java.util.Objects) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) DataSourceReadOptions(org.apache.hudi.DataSourceReadOptions) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) Row(org.apache.spark.sql.Row) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Pair(org.apache.hudi.common.util.collection.Pair)
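A hedged caller sketch (the variable names, fetch size, and table path are illustrative): the outer Pair carries the query type, the inner Pair the begin and end instant times.

// Assumes a live JavaSparkContext `jssc` and a Hudi table at `srcPath`.
Pair<String, Pair<String, String>> range = IncrSourceHelper.calculateBeginAndEndInstants(
    jssc, srcPath, /* numInstantsPerFetch */ 5, Option.empty(), MissingCheckpointStrategy.READ_LATEST);
String queryType = range.getLeft();                   // incremental or snapshot query type
String beginInstantTime = range.getRight().getLeft(); // begin of the fetch window
String endInstantTime = range.getRight().getRight();  // end of the fetch window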

Example 73 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class BaseHoodieWriteClient, the method rollbackFailedBootstrap:

/**
 * Main API to roll back a failed bootstrap.
 */
protected void rollbackFailedBootstrap() {
    LOG.info("Rolling back pending bootstrap if present");
    HoodieTable<T, I, K, O> table = createTable(config, hadoopConf, config.isMetadataTableEnabled());
    HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
    Option<String> instant = Option.fromJavaOptional(inflightTimeline.getReverseOrderedInstants().map(HoodieInstant::getTimestamp).findFirst());
    if (instant.isPresent() && HoodieTimeline.compareTimestamps(instant.get(), HoodieTimeline.LESSER_THAN_OR_EQUALS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS)) {
        LOG.info("Found pending bootstrap instants. Rolling them back");
        table.rollbackBootstrap(context, HoodieActiveTimeline.createNewInstantTime());
        LOG.info("Finished rolling back pending bootstrap");
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)
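The guard works because bootstrap operations are written with reserved instant timestamps that sort before any real commit time, so a plain string comparison on the timeline detects them. A hedged illustration (the timestamp value is made up):

// compareTimestamps(a, operator, b) evaluates `a operator b` over instant-time strings.
String pendingInstant = "00000000000002"; // illustrative bootstrap-range timestamp
boolean isBootstrap = HoodieTimeline.compareTimestamps(
    pendingInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, HoodieTimeline.FULL_BOOTSTRAP_INSTANT_TS);
// A normal commit time such as "20220101093000" compares greater and is left alone.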

Example 74 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class HoodieTimelineArchiver, the method getCommitInstantsToArchive:

private Stream<HoodieInstant> getCommitInstantsToArchive() {
    // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify
    // with logic above to avoid Stream.concat
    HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
    Option<HoodieInstant> oldestPendingCompactionAndReplaceInstant = table.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMPACTION_ACTION, HoodieTimeline.REPLACE_COMMIT_ACTION))
        .filter(s -> !s.isCompleted()).firstInstant();
    Option<HoodieInstant> oldestInflightCommitInstant = table.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.COMMIT_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
        .filterInflights().firstInstant();
    // We cannot have any holes in the commit timeline. We cannot archive any commits which are
    // made after the first savepoint present.
    Option<HoodieInstant> firstSavepoint = table.getCompletedSavepointTimeline().firstInstant();
    if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) {
        // For Merge-On-Read table, inline or async compaction is enabled
        // We need to make sure that there are enough delta commits in the active timeline
        // to trigger compaction scheduling, when the trigger strategy of compaction is
        // NUM_COMMITS or NUM_AND_TIME.
        Option<HoodieInstant> oldestInstantToRetainForCompaction =
            (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ
                && (config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_COMMITS
                    || config.getInlineCompactTriggerStrategy() == CompactionTriggerStrategy.NUM_AND_TIME))
                ? CompactionUtils.getOldestInstantToRetainForCompaction(table.getActiveTimeline(), config.getInlineCompactDeltaCommitMax())
                : Option.empty();
        // Actually do the commits
        Stream<HoodieInstant> instantToArchiveStream = commitTimeline.getInstants().filter(s -> {
            // if no savepoint present, then don't filter
            return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp()));
        }).filter(s -> {
            // Ensure commits at or after the oldest pending compaction/replace commit are retained
            return oldestPendingCompactionAndReplaceInstant.map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())).orElse(true);
        }).filter(s -> {
            // With LAZY failed-writes cleaning, instants at or after the oldest inflight commit must not
            // get archived, i.e., instants after the oldestInflight are retained on the timeline
            if (config.getFailedWritesCleanPolicy() == HoodieFailedWritesCleaningPolicy.LAZY) {
                return oldestInflightCommitInstant.map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())).orElse(true);
            }
            return true;
        }).filter(s -> oldestInstantToRetainForCompaction.map(instantToRetain -> HoodieTimeline.compareTimestamps(s.getTimestamp(), LESSER_THAN, instantToRetain.getTimestamp())).orElse(true));
        return instantToArchiveStream.limit(commitTimeline.countInstants() - minInstantsToKeep);
    } else {
        return Stream.empty();
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) Schema(org.apache.avro.Schema) Collection(java.util.Collection) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) HoodieMergeArchiveFilePlan(org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) LESSER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) CompactionTriggerStrategy(org.apache.hudi.table.action.compact.CompactionTriggerStrategy) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LESSER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) IOException(java.io.IOException) StorageSchemes(org.apache.hudi.common.fs.StorageSchemes) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
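The trigger and trim arithmetic is easy to miss inside the stream chain: archiving starts only once the completed timeline exceeds maxInstantsToKeep, and then archives the oldest eligible instants until minInstantsToKeep remain. A worked sketch with made-up numbers:

// Illustrative values; the real ones come from the write config.
int countInstants = 35;
int maxInstantsToKeep = 30;
int minInstantsToKeep = 20;
boolean shouldArchive = countInstants > maxInstantsToKeep; // true: threshold crossed
long limit = countInstants - minInstantsToKeep;            // at most 15 oldest instants archived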

Example 75 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class FileSystemViewManager, the method createSpillableMapBasedFileSystemView:

/**
 * Create a spillable Map based file system view for a table.
 *
 * @param conf Hadoop configuration
 * @param viewConf View storage configuration
 * @param metaClient HoodieTableMetaClient
 * @param commonConfig Common configuration
 * @return A spillable Map based file system view
 */
private static SpillableMapBasedFileSystemView createSpillableMapBasedFileSystemView(SerializableConfiguration conf, FileSystemViewStorageConfig viewConf, HoodieTableMetaClient metaClient, HoodieCommonConfig commonConfig) {
    LOG.info("Creating SpillableMap based view for basePath " + metaClient.getBasePath());
    HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants();
    return new SpillableMapBasedFileSystemView(metaClient, timeline, viewConf, commonConfig);
}
Also used : HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)
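Since the method is private, callers reach it through FileSystemViewManager's factory entry points; its inputs can still be sketched directly. A hedged example of building them (the base path is illustrative, and a Hadoop Configuration is assumed in scope):

HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(hadoopConf)                  // assumed Configuration
    .setBasePath("/tmp/hudi/trips_table") // illustrative path
    .build();
// Completed instants plus pending compactions, matching the filter used above.
HoodieTimeline timeline = metaClient.getActiveTimeline().filterCompletedAndCompactionInstants();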

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 118
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 74
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 39
List (java.util.List) 36
IOException (java.io.IOException) 34
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 34
ArrayList (java.util.ArrayList) 32
Option (org.apache.hudi.common.util.Option) 30
Collectors (java.util.stream.Collectors) 29
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 29
HoodieException (org.apache.hudi.exception.HoodieException) 26
Map (java.util.Map) 25
FileStatus (org.apache.hadoop.fs.FileStatus) 24
Path (org.apache.hadoop.fs.Path) 24
Set (java.util.Set) 22
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 22
FileSlice (org.apache.hudi.common.model.FileSlice) 21
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 21
Pair (org.apache.hudi.common.util.collection.Pair) 21
FSUtils (org.apache.hudi.common.fs.FSUtils) 20