
Example 81 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class IncrSourceHelper method calculateBeginAndEndInstants.

/**
 * Find begin and end instants to be set for the next fetch.
 *
 * @param jssc                            Java Spark Context
 * @param srcBasePath                     Base path of Hudi source table
 * @param numInstantsPerFetch             Max Instants per fetch
 * @param beginInstant                    Last Checkpoint String
 * @param missingCheckpointStrategy       strategy that determines how reading falls back when the begin instant (last checkpoint) is missing
 * @return begin and end instants along with query type.
 */
public static Pair<String, Pair<String, String>> calculateBeginAndEndInstants(
        JavaSparkContext jssc, String srcBasePath, int numInstantsPerFetch,
        Option<String> beginInstant, MissingCheckpointStrategy missingCheckpointStrategy) {
    ValidationUtils.checkArgument(numInstantsPerFetch > 0, "Make sure the config hoodie.deltastreamer.source.hoodieincr.num_instants is set to a positive value");
    HoodieTableMetaClient srcMetaClient = HoodieTableMetaClient.builder().setConf(jssc.hadoopConfiguration()).setBasePath(srcBasePath).setLoadActiveTimelineOnLoad(true).build();
    final HoodieTimeline activeCommitTimeline = srcMetaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
    String beginInstantTime = beginInstant.orElseGet(() -> {
        if (missingCheckpointStrategy != null) {
            if (missingCheckpointStrategy == MissingCheckpointStrategy.READ_LATEST) {
                Option<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
                return lastInstant.map(hoodieInstant -> getStrictlyLowerTimestamp(hoodieInstant.getTimestamp())).orElse(DEFAULT_BEGIN_TIMESTAMP);
            } else {
                return DEFAULT_BEGIN_TIMESTAMP;
            }
        } else {
            throw new IllegalArgumentException("Missing begin instant for incremental pull. For reading from latest " + "committed instant set hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy to a valid value");
        }
    });
    if (missingCheckpointStrategy == MissingCheckpointStrategy.READ_LATEST || !activeCommitTimeline.isBeforeTimelineStarts(beginInstantTime)) {
        Option<HoodieInstant> nthInstant = Option.fromJavaOptional(activeCommitTimeline.findInstantsAfter(beginInstantTime, numInstantsPerFetch).getInstants().reduce((x, y) -> y));
        return Pair.of(DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL(), Pair.of(beginInstantTime, nthInstant.map(HoodieInstant::getTimestamp).orElse(beginInstantTime)));
    } else {
        // when MissingCheckpointStrategy is set to read everything until latest, trigger snapshot query.
        Option<HoodieInstant> lastInstant = activeCommitTimeline.lastInstant();
        return Pair.of(DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL(), Pair.of(beginInstantTime, lastInstant.get().getTimestamp()));
    }
}
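
For orientation, a minimal usage sketch of the method above follows; the JavaSparkContext ("jssc"), the source base path, and the fetch size are assumptions made for illustration, not part of the original snippet.

// Minimal usage sketch (assumes an existing JavaSparkContext "jssc"; the path and fetch size are illustrative).
String srcBasePath = "/tmp/hudi/source_table";
Pair<String, Pair<String, String>> queryInfo = IncrSourceHelper.calculateBeginAndEndInstants(
        jssc, srcBasePath, 5, Option.empty(), MissingCheckpointStrategy.READ_LATEST);
String queryType    = queryInfo.getLeft();              // incremental vs. snapshot query type
String beginInstant = queryInfo.getRight().getLeft();   // begin instant time
String endInstant   = queryInfo.getRight().getRight();  // end instant time
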
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Objects(java.util.Objects) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) DataSourceReadOptions(org.apache.hudi.DataSourceReadOptions) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) Row(org.apache.spark.sql.Row) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Pair(org.apache.hudi.common.util.collection.Pair)

Example 82 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class CompactionAdminClient method unscheduleCompactionPlan.

/**
 * Un-schedules a compaction plan. Removes all scheduled compaction operations and re-arranges delta-files that were
 * created after the compaction was scheduled.
 *
 * This operation MUST be executed with compactions and writer turned OFF.
 *
 * @param compactionInstant Compaction Instant
 * @param skipValidation Skip validation step
 * @param parallelism Parallelism
 * @param dryRun Dry Run
 */
public List<RenameOpResult> unscheduleCompactionPlan(String compactionInstant, boolean skipValidation, int parallelism, boolean dryRun) throws Exception {
    HoodieTableMetaClient metaClient = createMetaClient(false);
    List<Pair<HoodieLogFile, HoodieLogFile>> renameActions = getRenamingActionsForUnschedulingCompactionPlan(metaClient, compactionInstant, parallelism, Option.empty(), skipValidation);
    List<RenameOpResult> res = runRenamingOps(metaClient, renameActions, parallelism, dryRun);
    Option<Boolean> success = Option.fromJavaOptional(res.stream().map(r -> (r.isExecuted() && r.isSuccess())).reduce(Boolean::logicalAnd));
    Option<Boolean> allSuccess = success.isPresent() ? Option.of(success.get()) : Option.empty();
    // Only if all operations are successfully executed
    if (!dryRun && allSuccess.isPresent() && allSuccess.get()) {
        // Overwrite compaction request with empty compaction operations
        HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, COMPACTION_ACTION, compactionInstant);
        Path inflightPath = new Path(metaClient.getMetaPath(), inflight.getFileName());
        if (metaClient.getFs().exists(inflightPath)) {
            // We need to rollback data-files because of this inflight compaction before unscheduling
            throw new IllegalStateException("Please rollback the inflight compaction before unscheduling");
        }
        // Leave the trace in aux folder but delete from metapath.
        // TODO: Add a rollback instant but for compaction
        HoodieInstant instant = new HoodieInstant(State.REQUESTED, COMPACTION_ACTION, compactionInstant);
        boolean deleted = metaClient.getFs().delete(new Path(metaClient.getMetaPath(), instant.getFileName()), false);
        ValidationUtils.checkArgument(deleted, "Unable to delete compaction instant.");
    }
    return res;
}
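
As a hedged usage sketch of the API above: the client instance, instant time, and flag values below are illustrative assumptions; RenameOpResult exposes isExecuted() and isSuccess() as already used in the method body.

// Usage sketch (assumes an already constructed CompactionAdminClient "adminClient";
// the instant time and flag values are illustrative).
List<RenameOpResult> results = adminClient.unscheduleCompactionPlan(
        "20220101123000", /* skipValidation */ false, /* parallelism */ 2, /* dryRun */ true);
boolean allSucceeded = results.stream().allMatch(r -> r.isExecuted() && r.isSuccess());
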
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) Pair(org.apache.hudi.common.util.collection.Pair)

Example 83 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class CompactionAdminClient method getRenamingActionsForUnschedulingCompactionOperation.

/**
 * Generates renaming actions for unscheduling a compaction operation. NOTE: Can only be used safely when no writer
 * (ingestion/compaction) is running.
 *
 * @param metaClient Hoodie Table MetaClient
 * @param compactionInstant Compaction Instant
 * @param operation Compaction Operation
 * @param fsViewOpt Cached File System View
 * @param skipValidation Skip Validation
 * @return list of log-file pairs (old, new); for each pair, the rename must be performed to successfully unschedule
 *         the compaction.
 */
public List<Pair<HoodieLogFile, HoodieLogFile>> getRenamingActionsForUnschedulingCompactionOperation(
        HoodieTableMetaClient metaClient, String compactionInstant, CompactionOperation operation,
        Option<HoodieTableFileSystemView> fsViewOpt, boolean skipValidation) throws IOException {
    List<Pair<HoodieLogFile, HoodieLogFile>> result = new ArrayList<>();
    HoodieTableFileSystemView fileSystemView = fsViewOpt.isPresent() ? fsViewOpt.get() : new HoodieTableFileSystemView(metaClient, metaClient.getCommitsAndCompactionTimeline());
    if (!skipValidation) {
        validateCompactionOperation(metaClient, compactionInstant, operation, Option.of(fileSystemView));
    }
    HoodieInstant lastInstant = metaClient.getCommitsAndCompactionTimeline().lastInstant().get();
    FileSlice merged = fileSystemView.getLatestMergedFileSlicesBeforeOrOn(operation.getPartitionPath(), lastInstant.getTimestamp()).filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
    List<HoodieLogFile> logFilesToRepair = merged.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(compactionInstant)).sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
    FileSlice fileSliceForCompaction = fileSystemView.getLatestFileSlicesBeforeOrOn(operation.getPartitionPath(), operation.getBaseInstantTime(), true).filter(fs -> fs.getFileId().equals(operation.getFileId())).findFirst().get();
    int maxUsedVersion = fileSliceForCompaction.getLogFiles().findFirst().map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION - 1);
    String logExtn = fileSliceForCompaction.getLogFiles().findFirst().map(lf -> "." + lf.getFileExtension()).orElse(HoodieLogFile.DELTA_EXTENSION);
    String parentPath = fileSliceForCompaction.getBaseFile().map(df -> new Path(df.getPath()).getParent().toString()).orElse(fileSliceForCompaction.getLogFiles().findFirst().map(lf -> lf.getPath().getParent().toString()).get());
    for (HoodieLogFile toRepair : logFilesToRepair) {
        int version = maxUsedVersion + 1;
        HoodieLogFile newLf = new HoodieLogFile(new Path(parentPath, FSUtils.makeLogFileName(operation.getFileId(), logExtn, operation.getBaseInstantTime(), version, HoodieLogFormat.UNKNOWN_WRITE_TOKEN)));
        result.add(Pair.of(toRepair, newLf));
        maxUsedVersion = version;
    }
    return result;
}
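
A short, hedged sketch of consuming the returned rename pairs, written as if invoked from within CompactionAdminClient; metaClient, compactionInstant, and operation are assumed to be in scope, and passing Option.empty() lets the method build its own file-system view as shown above.

// Sketch: compute and inspect the renames needed to unschedule one compaction operation
// (metaClient, compactionInstant, and operation are assumed to be available).
List<Pair<HoodieLogFile, HoodieLogFile>> renames =
        getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, operation, Option.empty(), false);
for (Pair<HoodieLogFile, HoodieLogFile> rename : renames) {
    System.out.println("rename " + rename.getLeft().getPath() + " -> " + rename.getRight().getPath());
}
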
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) OperationResult(org.apache.hudi.table.action.compact.OperationResult) FileStatus(org.apache.hadoop.fs.FileStatus) COMPACTION_ACTION(org.apache.hudi.common.table.timeline.HoodieTimeline.COMPACTION_ACTION) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 84 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class HoodieTimelineArchiver method getInstantsToArchive.

private Stream<HoodieInstant> getInstantsToArchive() {
    Stream<HoodieInstant> instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive());
    // For archiving and cleaning instants, we need to include intermediate state files if they exist
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    Map<Pair<String, String>, List<HoodieInstant>> groupByTsAction = rawActiveTimeline.getInstants().collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), HoodieInstant.getComparableAction(i.getAction()))));
    // If metadata table is enabled, do not archive instants which are more recent than the latest compaction on the
    // metadata table.
    if (config.isMetadataTableEnabled()) {
        try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
            Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
            if (!latestCompactionTime.isPresent()) {
                LOG.info("Not archiving as there is no compaction yet on the metadata table");
                instants = Stream.empty();
            } else {
                LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
                instants = instants.filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, latestCompactionTime.get()));
            }
        } catch (Exception e) {
            throw new HoodieException("Error limiting instant archival based on metadata table", e);
        }
    }
    return instants.flatMap(hoodieInstant -> groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream());
}
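
The notable Pair usage here is the composite (timestamp, comparable action) key handed to Collectors.groupingBy; the standalone sketch below restates that pattern, assuming a HoodieInstant variable named "instant" for the lookup. It relies on Pair providing value-based equals/hashCode, which the map lookup in the method above also depends on.

// Group all instant state files by (timestamp, comparable action), then look one group up again
// (the "instant" variable used for the lookup is an illustrative assumption).
Map<Pair<String, String>, List<HoodieInstant>> byTsAction = rawActiveTimeline.getInstants()
        .collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), HoodieInstant.getComparableAction(i.getAction()))));
List<HoodieInstant> allStates =
        byTsAction.get(Pair.of(instant.getTimestamp(), HoodieInstant.getComparableAction(instant.getAction())));
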
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) Schema(org.apache.avro.Schema) Collection(java.util.Collection) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) HoodieMergeArchiveFilePlan(org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) LESSER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) CompactionTriggerStrategy(org.apache.hudi.table.action.compact.CompactionTriggerStrategy) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LESSER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) IOException(java.io.IOException) StorageSchemes(org.apache.hudi.common.fs.StorageSchemes) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 85 with Pair

use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

the class HoodieGlobalSimpleIndex method getTaggedRecords.

/**
 * Tag records with right {@link HoodieRecordLocation}.
 *
 * @param incomingRecords incoming {@link HoodieRecord}s
 * @param existingRecords existing records with {@link HoodieRecordLocation}s
 * @return {@link HoodieData} of {@link HoodieRecord}s with tagged {@link HoodieRecordLocation}s
 */
private <R> HoodieData<HoodieRecord<R>> getTaggedRecords(HoodiePairData<String, HoodieRecord<R>> incomingRecords,
        HoodiePairData<HoodieKey, HoodieRecordLocation> existingRecords) {
    HoodiePairData<String, Pair<String, HoodieRecordLocation>> existingRecordByRecordKey = existingRecords.mapToPair(entry -> new ImmutablePair<>(entry.getLeft().getRecordKey(), Pair.of(entry.getLeft().getPartitionPath(), entry.getRight())));
    return incomingRecords.leftOuterJoin(existingRecordByRecordKey).values().flatMap(entry -> {
        HoodieRecord<R> inputRecord = entry.getLeft();
        Option<Pair<String, HoodieRecordLocation>> partitionPathLocationPair = Option.ofNullable(entry.getRight().orElse(null));
        List<HoodieRecord<R>> taggedRecords;
        if (partitionPathLocationPair.isPresent()) {
            String partitionPath = partitionPathLocationPair.get().getKey();
            HoodieRecordLocation location = partitionPathLocationPair.get().getRight();
            if (config.getGlobalSimpleIndexUpdatePartitionPath() && !(inputRecord.getPartitionPath().equals(partitionPath))) {
                // Create an empty record to delete the record in the old partition
                HoodieRecord<R> deleteRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), new EmptyHoodieRecordPayload());
                deleteRecord.setCurrentLocation(location);
                deleteRecord.seal();
                // Tag the incoming record for inserting to the new partition
                HoodieRecord<R> insertRecord = (HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty());
                taggedRecords = Arrays.asList(deleteRecord, insertRecord);
            } else {
                // Ignore the incoming record's partition, regardless of whether it differs from its old partition.
                // When it differs, the record will still be updated at its old partition.
                HoodieRecord<R> newRecord = new HoodieAvroRecord(new HoodieKey(inputRecord.getRecordKey(), partitionPath), (HoodieRecordPayload) inputRecord.getData());
                taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(newRecord, Option.ofNullable(location)));
            }
        } else {
            taggedRecords = Collections.singletonList((HoodieRecord<R>) HoodieIndexUtils.getTaggedRecord(inputRecord, Option.empty()));
        }
        return taggedRecords.iterator();
    });
}
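
A small sketch of the (partitionPath, HoodieRecordLocation) Pair that flows through the join above; the instant time, file id, and partition path literals are purely illustrative assumptions, and the HoodieRecordLocation is constructed here only for demonstration.

// Build and unpack the same kind of Pair carried through the left-outer join
// (all literal values below are illustrative).
HoodieRecordLocation location = new HoodieRecordLocation("20220101123000", "file-id-001");
Pair<String, HoodieRecordLocation> partitionAndLocation = Pair.of("2022/01/01", location);
String partitionPath = partitionAndLocation.getKey();        // equivalent to getLeft()
HoodieRecordLocation loc = partitionAndLocation.getRight();  // equivalent to getValue()
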
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair) 147
List (java.util.List) 98
Map (java.util.Map) 91
IOException (java.io.IOException) 89
Collectors (java.util.stream.Collectors) 87
Option (org.apache.hudi.common.util.Option) 87
ArrayList (java.util.ArrayList) 85
Path (org.apache.hadoop.fs.Path) 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 66
HashMap (java.util.HashMap) 65
LogManager (org.apache.log4j.LogManager) 64
Logger (org.apache.log4j.Logger) 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 54
Arrays (java.util.Arrays) 48
HoodieTable (org.apache.hudi.table.HoodieTable) 46
Test (org.junit.jupiter.api.Test) 46