Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class DeltaSync, method syncOnce.
/**
* Run one round of delta sync and return new compaction instant if one got scheduled.
*/
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
  Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
  Timer.Context overallTimerContext = metrics.getOverallTimerContext();
  // Refresh Timeline
  refreshTimeline();
  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
  if (null != srcRecordsWithCkpt) {
    // First batch: the schema is now known, so initialize the write client (and compaction) with it
    if (null == writeClient) {
      this.schemaProvider = srcRecordsWithCkpt.getKey();
      // Setup HoodieWriteClient and compaction now that we decided on schema
      setupWriteClient();
    } else {
      Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
      Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
      if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
        LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true) + ", Target :" + newTargetSchema.toString(true));
        // We need to recreate write client with new schema and register them.
        reInitWriteClient(newSourceSchema, newTargetSchema);
        processedSchema.addSchema(newSourceSchema);
        processedSchema.addSchema(newTargetSchema);
      }
    }
    // complete the pending clustering before writing to sink
    if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
      Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
      if (pendingClusteringInstant.isPresent()) {
        writeClient.cluster(pendingClusteringInstant.get(), true);
      }
    }
    result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
  }
  metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());
  // Clear persistent RDDs
  jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
  return result;
}
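Below is a minimal sketch, not part of the Hudi source, of how a caller might consume the Option returned by syncOnce; the deltaSync instance and the LOG logger are assumed for illustration.
// Hypothetical caller: run one sync round and check whether a compaction instant was scheduled.
Pair<Option<String>, JavaRDD<WriteStatus>> syncResult = deltaSync.syncOnce();
if (syncResult != null) {
  Option<String> scheduledCompactionInstant = syncResult.getLeft();
  if (scheduledCompactionInstant.isPresent()) {
    LOG.info("Compaction scheduled at instant " + scheduledCompactionInstant.get());
  } else {
    LOG.info("No compaction was scheduled in this round");
  }
}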
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class HoodieRepairTool, method doRepair.
/**
 * Does repair, either in REPAIR or DRY_RUN mode.
 *
 * @param startingInstantOption {@link Option} of starting instant for scanning, can be empty.
 * @param endingInstantOption   {@link Option} of ending instant for scanning, can be empty.
 * @param isDryRun              Is dry run.
 * @return {@code true} if the repair (or dry run) completes successfully, {@code false} otherwise.
 * @throws IOException upon errors.
 */
boolean doRepair(Option<String> startingInstantOption, Option<String> endingInstantOption, boolean isDryRun) throws IOException {
  // Scans all partitions to find base and log files in the base path
  List<Path> allFilesInPartitions = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
  // Buckets the files based on instant time
  // instant time -> relative paths of base and log files to base path
  Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allFilesInPartitions);
  List<String> instantTimesToRepair = instantToFilesMap.keySet().stream()
      .filter(instant -> (!startingInstantOption.isPresent() || instant.compareTo(startingInstantOption.get()) >= 0)
          && (!endingInstantOption.isPresent() || instant.compareTo(endingInstantOption.get()) <= 0))
      .collect(Collectors.toList());
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline();
  // This assumes that the archived timeline only has completed instants so this is safe
  archivedTimeline.loadCompletedInstantDetailsInMemory();
  List<ImmutablePair<String, List<String>>> instantFilesToRemove = context.parallelize(instantTimesToRepair)
      .map(instantToRepair -> new ImmutablePair<>(instantToRepair,
          RepairUtils.findInstantFilesToRemove(instantToRepair, instantToFilesMap.get(instantToRepair), activeTimeline, archivedTimeline)))
      .collectAsList();
  List<ImmutablePair<String, List<String>>> instantsWithDanglingFiles = instantFilesToRemove.stream()
      .filter(e -> !e.getValue().isEmpty())
      .collect(Collectors.toList());
  printRepairInfo(instantTimesToRepair, instantsWithDanglingFiles);
  if (!isDryRun) {
    List<String> relativeFilePathsToDelete = instantsWithDanglingFiles.stream()
        .flatMap(e -> e.getValue().stream())
        .collect(Collectors.toList());
    if (relativeFilePathsToDelete.size() > 0) {
      if (!backupFiles(relativeFilePathsToDelete)) {
        LOG.error("Error backing up dangling files. Exiting...");
        return false;
      }
      return deleteFiles(context, cfg.basePath, relativeFilePathsToDelete);
    }
    LOG.info(String.format("Table repair on %s is successful", cfg.basePath));
  }
  return true;
}
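The following is a minimal sketch, assuming package-private access to a HoodieRepairTool instance named repairTool (for example from a test in the same package); it shows how the Option-typed bounds are supplied: Option.empty() leaves a bound open, Option.of(...) pins it.
// Hypothetical invocation (instant values are illustrative): bound the scan and run in REPAIR mode.
Option<String> startingInstant = Option.of("20220101000000"); // inclusive lower bound on instant time
Option<String> endingInstant = Option.empty();                // no upper bound: scan up to the latest instant
boolean succeeded = repairTool.doRepair(startingInstant, endingInstant, false); // false => actually back up and delete
if (!succeeded) {
  LOG.error("Repair could not back up or delete the dangling files");
}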
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class KafkaOffsetGen, method getNextOffsetRanges.
public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
  // Obtain current metadata for the topic
  Map<TopicPartition, Long> fromOffsets;
  Map<TopicPartition, Long> toOffsets;
  try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
    if (!checkTopicExists(consumer)) {
      throw new HoodieException("Kafka topic:" + topicName + " does not exist");
    }
    List<PartitionInfo> partitionInfoList = consumer.partitionsFor(topicName);
    Set<TopicPartition> topicPartitions = partitionInfoList.stream()
        .map(x -> new TopicPartition(x.topic(), x.partition()))
        .collect(Collectors.toSet());
    if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
      lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
    }
    // Determine the offset ranges to read from
    if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
      fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
      metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
    } else {
      switch (autoResetValue) {
        case EARLIEST:
          fromOffsets = consumer.beginningOffsets(topicPartitions);
          break;
        case LATEST:
          fromOffsets = consumer.endOffsets(topicPartitions);
          break;
        case GROUP:
          fromOffsets = getGroupOffsets(consumer, topicPartitions);
          break;
        default:
          throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' or 'group' ");
      }
    }
    // Obtain the latest offsets.
    toOffsets = consumer.endOffsets(topicPartitions);
  }
  // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
  long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
  long numEvents;
  if (sourceLimit == Long.MAX_VALUE) {
    numEvents = maxEventsToReadFromKafka;
    LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
  } else {
    numEvents = sourceLimit;
  }
  if (numEvents < toOffsets.size()) {
    throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
  }
  return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
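A minimal sketch, not from the Hudi source, of feeding a checkpoint Option into getNextOffsetRanges and summing the events covered by the returned ranges; kafkaOffsetGen, metrics, previousCheckpoint, and the source-limit value are assumed for illustration, and OffsetRange.count() (Spark's untilOffset minus fromOffset) is used for the per-partition event count.
// Hypothetical caller: resume from a previously committed checkpoint string, or start fresh.
Option<String> lastCheckpoint = previousCheckpoint == null ? Option.empty() : Option.of(previousCheckpoint);
OffsetRange[] ranges = kafkaOffsetGen.getNextOffsetRanges(lastCheckpoint, 5000000L, metrics);
long totalEvents = 0L;
for (OffsetRange range : ranges) {
  totalEvents += range.count(); // events this range covers for its topic partition
}
LOG.info("Next batch reads " + ranges.length + " offset ranges covering up to " + totalEvents + " events");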
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class HiveIncrPullSource, method fetchNewData.
@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
  try {
    // find the source commit to pull
    Option<String> commitToPull = findCommitToPull(lastCheckpointStr);
    if (!commitToPull.isPresent()) {
      return new InputBatch<>(Option.empty(), lastCheckpointStr.isPresent() ? lastCheckpointStr.get() : "");
    }
    // read the files out.
    List<FileStatus> commitDeltaFiles = Arrays.asList(fs.listStatus(new Path(incrPullRootPath, commitToPull.get())));
    String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
    JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class, AvroKey.class,
        NullWritable.class, sparkContext.hadoopConfiguration());
    sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch new data");
    return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))), String.valueOf(commitToPull.get()));
  } catch (IOException ioe) {
    throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
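A minimal sketch, assuming access to the protected method (for example from a subclass) and assuming InputBatch exposes getBatch() and getCheckpointForNextBatch(); it contrasts the empty and present checkpoint Option on consecutive fetches.
// Hypothetical driver: first fetch with no checkpoint, then resume from the returned one.
InputBatch<JavaRDD<GenericRecord>> firstBatch = fetchNewData(Option.empty(), Long.MAX_VALUE);
String checkpoint = firstBatch.getCheckpointForNextBatch();
InputBatch<JavaRDD<GenericRecord>> nextBatch = fetchNewData(Option.of(checkpoint), Long.MAX_VALUE);
if (nextBatch.getBatch().isPresent()) {
  long records = nextBatch.getBatch().get().count(); // materializes the RDD to count pulled records
  LOG.info("Pulled " + records + " records after checkpoint " + checkpoint);
}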
Use of org.apache.hudi.common.util.Option in project hudi by apache.
The class BaseHoodieWriteClient, method tryUpgrade.
private void tryUpgrade(HoodieTableMetaClient metaClient, Option<String> instantTime) {
  UpgradeDowngrade upgradeDowngrade = new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper);
  if (upgradeDowngrade.needsUpgradeOrDowngrade(HoodieTableVersion.current())) {
    // Ensure no inflight commits by setting EAGER policy and explicitly cleaning all failed commits
    List<String> instantsToRollback = getInstantsToRollback(metaClient, HoodieFailedWritesCleaningPolicy.EAGER, instantTime);
    Map<String, Option<HoodiePendingRollbackInfo>> pendingRollbacks = getPendingRollbackInfos(metaClient);
    instantsToRollback.forEach(entry -> pendingRollbacks.putIfAbsent(entry, Option.empty()));
    rollbackFailedWrites(pendingRollbacks, true);
    new UpgradeDowngrade(metaClient, config, context, upgradeDowngradeHelper).run(HoodieTableVersion.current(), instantTime.orElse(null));
    metaClient.reloadActiveTimeline();
  }
}
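As a closing illustration (names and values below are hypothetical), tryUpgrade relies on two Option idioms: seeding a map with Option.empty() placeholders instead of nulls, and unwrapping an optional value with orElse(null) for an API that accepts a nullable argument.
// Sketch of the Option idioms above, with hypothetical values.
Map<String, Option<HoodiePendingRollbackInfo>> pendingRollbacks = new HashMap<>();
// An instant with no known rollback plan is recorded with an empty Option rather than a null value.
pendingRollbacks.putIfAbsent("20220101000000", Option.empty());
Option<String> instantTime = Option.of("20220102000000");
// orElse(null) degrades the Option to a nullable String where the callee expects one.
String instantOrNull = instantTime.orElse(null);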