
Example 6 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class DeltaSync method syncOnce.

/**
 * Run one round of delta sync and return new compaction instant if one got scheduled.
 */
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
    Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
    Timer.Context overallTimerContext = metrics.getOverallTimerContext();
    // Refresh Timeline
    refreshTimeline();
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
    if (null != srcRecordsWithCkpt) {
        // If the write client has not been created yet, this is the first input batch:
        // use its schema to set up the write client before writing.
        if (null == writeClient) {
            this.schemaProvider = srcRecordsWithCkpt.getKey();
            // Setup HoodieWriteClient and compaction now that we decided on schema
            setupWriteClient();
        } else {
            Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
            Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
            if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
                LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true) + ", Target :" + newTargetSchema.toString(true));
                // We need to recreate write client with new schema and register them.
                reInitWriteClient(newSourceSchema, newTargetSchema);
                processedSchema.addSchema(newSourceSchema);
                processedSchema.addSchema(newTargetSchema);
            }
        }
        // complete the pending clustering before writing to sink
        if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
            Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
            if (pendingClusteringInstant.isPresent()) {
                writeClient.cluster(pendingClusteringInstant.get(), true);
            }
        }
        result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
    }
    metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());
    // Clear persistent RDDs
    jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
    return result;
}
Also used : Timer(com.codahale.metrics.Timer) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Option(org.apache.hudi.common.util.Option) JavaRDD(org.apache.spark.api.java.JavaRDD) Pair(org.apache.hudi.common.util.collection.Pair)
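
A minimal sketch of how a caller might consume the pair returned by syncOnce(); the deltaSync instance, logger, and error check below are assumptions, not taken from the Hudi code above.

// Hypothetical caller; deltaSync is assumed to be an initialized DeltaSync instance.
Pair<Option<String>, JavaRDD<WriteStatus>> syncResult = deltaSync.syncOnce();
if (syncResult != null) {
    Option<String> scheduledCompactionInstant = syncResult.getLeft();
    if (scheduledCompactionInstant.isPresent()) {
        // A compaction was scheduled during this round; its instant time is available here.
        LOG.info("Compaction scheduled at instant " + scheduledCompactionInstant.get());
    }
    // Inspect the write statuses for errors before moving on to the next round.
    long errorCount = syncResult.getRight().filter(WriteStatus::hasErrors).count();
}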

Example 7 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class HoodieRepairTool method doRepair.

/**
 * Does repair, either in REPAIR or DRY_RUN mode.
 *
 * @param startingInstantOption {@link Option} of starting instant for scanning, can be empty.
 * @param endingInstantOption   {@link Option} of ending instant for scanning, can be empty.
 * @param isDryRun              Is dry run.
 * @throws IOException upon errors.
 */
boolean doRepair(Option<String> startingInstantOption, Option<String> endingInstantOption, boolean isDryRun) throws IOException {
    // Scans all partitions to find base and log files in the base path
    List<Path> allFilesInPartitions = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
    // Buckets the files based on instant time
    // instant time -> relative paths of base and log files to base path
    Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allFilesInPartitions);
    List<String> instantTimesToRepair = instantToFilesMap.keySet().stream()
            .filter(instant -> (!startingInstantOption.isPresent() || instant.compareTo(startingInstantOption.get()) >= 0)
                    && (!endingInstantOption.isPresent() || instant.compareTo(endingInstantOption.get()) <= 0))
            .collect(Collectors.toList());
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
    // This assumes that the archived timeline only has completed instants so this is safe
    archivedTimeline.loadCompletedInstantDetailsInMemory();
    List<ImmutablePair<String, List<String>>> instantFilesToRemove = context.parallelize(instantTimesToRepair)
            .map(instantToRepair -> new ImmutablePair<>(instantToRepair,
                    RepairUtils.findInstantFilesToRemove(instantToRepair, instantToFilesMap.get(instantToRepair), activeTimeline, archivedTimeline)))
            .collectAsList();
    List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles = instantFilesToRemove.stream().filter(e -> !e.getValue().isEmpty()).collect(Collectors.toList());
    printRepairInfo(instantTimesToRepair, instantsWithDanglingFiles);
    if (!isDryRun) {
        List<String> relativeFilePathsToDelete = instantsWithDanglingFiles.stream().flatMap(e -> e.getValue().stream()).collect(Collectors.toList());
        if (relativeFilePathsToDelete.size() > 0) {
            if (!backupFiles(relativeFilePathsToDelete)) {
                LOG.error("Error backing up dangling files. Exiting...");
                return false;
            }
            return deleteFiles(context, cfg.basePath, relativeFilePathsToDelete);
        }
        LOG.info(String.format("Table repair on %s is successful", cfg.basePath));
    }
    return true;
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) SecureRandom(java.security.SecureRandom) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) List(java.util.List) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RepairUtils(org.apache.hudi.table.repair.RepairUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
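
A hedged sketch of how the two Option-typed bounds might be supplied when calling doRepair; the repairTool variable is hypothetical, the instant time is arbitrary, and the call is written as same-package code since the method is package-private.

// Hypothetical invocation; repairTool is assumed to be a constructed HoodieRepairTool.
Option<String> startingInstant = Option.of("20220101000000");
Option<String> endingInstant = Option.empty();  // empty Option: no upper bound on the scan
try {
    boolean clean = repairTool.doRepair(startingInstant, endingInstant, true);  // true = dry run only
} catch (IOException e) {
    // propagate or log the failure as appropriate for the caller
}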

Example 8 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class KafkaOffsetGen method getNextOffsetRanges.

public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
    // Obtain current metadata for the topic
    Map<TopicPartition, Long> fromOffsets;
    Map<TopicPartition, Long> toOffsets;
    try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
        if (!checkTopicExists(consumer)) {
            throw new HoodieException("Kafka topic:" + topicName + " does not exist");
        }
        List<PartitionInfo> partitionInfoList;
        partitionInfoList = consumer.partitionsFor(topicName);
        Set<TopicPartition> topicPartitions = partitionInfoList.stream().map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
        if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
            lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
        }
        // Determine the offset ranges to read from
        if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
            fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
            metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
        } else {
            switch(autoResetValue) {
                case EARLIEST:
                    fromOffsets = consumer.beginningOffsets(topicPartitions);
                    break;
                case LATEST:
                    fromOffsets = consumer.endOffsets(topicPartitions);
                    break;
                case GROUP:
                    fromOffsets = getGroupOffsets(consumer, topicPartitions);
                    break;
                default:
                    throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' or 'group' ");
            }
        }
        // Obtain the latest offsets.
        toOffsets = consumer.endOffsets(topicPartitions);
    }
    // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
    long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
    long numEvents;
    if (sourceLimit == Long.MAX_VALUE) {
        numEvents = maxEventsToReadFromKafka;
        LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
    } else {
        numEvents = sourceLimit;
    }
    if (numEvents < toOffsets.size()) {
        throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
    }
    return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
Also used : Arrays(java.util.Arrays) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) DataSourceUtils(org.apache.hudi.DataSourceUtils) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) Function(java.util.function.Function) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) Map(java.util.Map) AvroKafkaSource(org.apache.hudi.utilities.sources.AvroKafkaSource) HoodieDeltaStreamerException(org.apache.hudi.utilities.exception.HoodieDeltaStreamerException) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) TopicPartition(org.apache.kafka.common.TopicPartition) TimeoutException(org.apache.kafka.common.errors.TimeoutException) TypedProperties(org.apache.hudi.common.config.TypedProperties) Set(java.util.Set) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) PartitionInfo(org.apache.kafka.common.PartitionInfo) OffsetAndTimestamp(org.apache.kafka.clients.consumer.OffsetAndTimestamp) Collectors(java.util.stream.Collectors) List(java.util.List) ConfigProperty(org.apache.hudi.common.config.ConfigProperty) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) CommitFailedException(org.apache.kafka.clients.consumer.CommitFailedException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) KafkaConsumer(org.apache.kafka.clients.consumer.KafkaConsumer)
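
As a rough sketch under stated assumptions, the calls below show an empty checkpoint Option on the first run versus a populated one later; kafkaOffsetGen and metrics are assumed to exist, and the checkpoint string format is an assumption based on how the method above parses it.

// Hypothetical driver code; kafkaOffsetGen and metrics are assumed to exist already.
// First run: no checkpoint yet, so the configured auto.offset.reset strategy picks the start offsets.
OffsetRange[] initialRanges = kafkaOffsetGen.getNextOffsetRanges(Option.empty(), 500000, metrics);
// Later run: resume from a persisted checkpoint (format assumed: topic,partition:offset,...).
Option<String> lastCheckpoint = Option.of("impressions,0:1250,1:980");
OffsetRange[] nextRanges = kafkaOffsetGen.getNextOffsetRanges(lastCheckpoint, 500000, metrics);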

Example 9 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class HiveIncrPullSource method fetchNewData.

@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        // find the source commit to pull
        Option<String> commitToPull = findCommitToPull(lastCheckpointStr);
        if (!commitToPull.isPresent()) {
            return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
        }
        // read the files out.
        List<FileStatus> commitDeltaFiles = Arrays.asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
        String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
        JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
        sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch new data");
        return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get()));
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) NullWritable(org.apache.hadoop.io.NullWritable) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) DataSourceUtils(org.apache.hudi.DataSourceUtils) AvroKey(org.apache.avro.mapred.AvroKey) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) TypedProperties(org.apache.hudi.common.config.TypedProperties) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Collectors(java.util.stream.Collectors) AvroKeyInputFormat(org.apache.avro.mapreduce.AvroKeyInputFormat) List(java.util.List) HiveIncrementalPuller(org.apache.hudi.utilities.HiveIncrementalPuller) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
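
A minimal sketch of consuming the InputBatch produced above, assuming a source variable visible from a subclass or same-package test (fetchNewData is protected) and assuming InputBatch exposes getBatch() and getCheckpointForNextBatch() accessors.

// Hypothetical consumer; source is assumed to be a configured HiveIncrPullSource.
InputBatch<JavaRDD<GenericRecord>> batch = source.fetchNewData(Option.of("20220101000000"), Long.MAX_VALUE);
Option<JavaRDD<GenericRecord>> newData = batch.getBatch();
if (newData.isPresent()) {
    long recordCount = newData.get().count();  // materializes the records of the pulled commit
} else {
    // No commit newer than the checkpoint: the previous checkpoint is carried forward unchanged.
    String unchangedCheckpoint = batch.getCheckpointForNextBatch();
}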

Example 10 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

the class BaseHoodieWriteClient method tryUpgrade.

private void tryUpgrade(HoodieTableMetaClient metaClient, Option<String> instantTime) {
    UpgradeDowngrade upgradeDowngrade = new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper);
    if (upgradeDowngrade.needsUpgradeOrDowngrade(HoodieTableVersion.current())) {
        // Ensure no inflight commits by setting EAGER policy and explicitly cleaning all failed commits
        List<String> instantsToRollback = getInstantsToRollback(metaClient, HoodieFailedWritesCleaningPolicy.EAGER, instantTime);
        Map<String, Option<HoodiePendingRollbackInfo>> pendingRollbacks = getPendingRollbackInfos(metaClient);
        instantsToRollback.forEach(entry -> pendingRollbacks.putIfAbsent(entry, Option.empty()));
        rollbackFailedWrites(pendingRollbacks, true);
        new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper).run(HoodieTableVersion.current(), instantTime.orElse(null));
        metaClient.reloadActiveTimeline();
    }
}
Also used : SupportsUpgradeDowngrade(org.apache.hudi.table.upgrade.SupportsUpgradeDowngrade) UpgradeDowngrade(org.apache.hudi.table.upgrade.UpgradeDowngrade) Option(org.apache.hudi.common.util.Option)
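
The two Option idioms used above, an empty Option as an explicit placeholder value and orElse(null) to unwrap an optional instant, are shown in isolation below; the instant times are hypothetical.

// Standalone illustration of the Option idioms above; the instant times are made up.
Option<String> instantTime = Option.of("20220101000000");
String resolved = instantTime.orElse(null);  // the wrapped value, or null when the Option is empty

Map<String, Option<HoodiePendingRollbackInfo>> pendingRollbacks = new HashMap<>();
// Register an instant whose rollback plan is not known yet; Option.empty() marks that absence explicitly.
pendingRollbacks.putIfAbsent("20220102000000", Option.empty());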

Aggregations

Option (org.apache.hudi.common.util.Option) 105
List (java.util.List) 84
IOException (java.io.IOException) 70
Collectors (java.util.stream.Collectors) 69
Map (java.util.Map) 67
ArrayList (java.util.ArrayList) 61
Path (org.apache.hadoop.fs.Path) 59
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 59
Pair (org.apache.hudi.common.util.collection.Pair) 59
HashMap (java.util.HashMap) 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 58
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 56
LogManager (org.apache.log4j.LogManager) 54
Logger (org.apache.log4j.Logger) 54
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 53
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 46
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 44
Arrays (java.util.Arrays) 43
FSUtils (org.apache.hudi.common.fs.FSUtils) 43
Collections (java.util.Collections) 39