Use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.
The class DeltaSync, method readFromSource.
/**
 * Read from the upstream source and apply transformations if needed.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> Input data read from the upstream source,
 *         consisting of the schema provider, the checkpoint string, and the RDD of HoodieRecords
 * @throws IOException in case of any error while reading from the source or initializing the table
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
  // Retrieve the previous round's checkpoint, if any
  Option<String> resumeCheckpointStr = Option.empty();
  if (commitTimelineOpt.isPresent()) {
    resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
  } else {
    // Initialize the table for the first time
    String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
    HoodieTableMetaClient.withPropertyBuilder()
        .setTableType(cfg.tableType)
        .setTableName(cfg.targetTableName)
        .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
        .setPayloadClassName(cfg.payloadClassName)
        .setBaseFileFormat(cfg.baseFileFormat)
        .setPartitionFields(partitionColumns)
        .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
        .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(),
            HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
        .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            SimpleKeyGenerator.class.getName()))
        .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
  }
  LOG.debug("Checkpoint from config: " + cfg.checkpoint);
  if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
    resumeCheckpointStr = Option.of(cfg.checkpoint);
  }
  LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);
  // Retry only when source-failure retries are enabled; otherwise make a single attempt
  int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
  int curRetryCount = 0;
  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
  while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
    try {
      sourceDataToSync = fetchFromSource(resumeCheckpointStr);
    } catch (HoodieSourceTimeoutException e) {
      if (curRetryCount >= maxRetryCount) {
        // Attempt budget exhausted: surface the timeout to the caller
        throw e;
      }
      try {
        LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause());
        LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount);
        Thread.sleep(cfg.retryIntervalSecs * 1000);
      } catch (InterruptedException ex) {
        LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
      }
    }
  }
  return sourceDataToSync;
}
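The retry loop above retries only on HoodieSourceTimeoutException and rethrows once the attempt budget is exhausted. Below is a minimal standalone sketch of that bounded-retry pattern; the BoundedRetry class, the SourceTimeoutException stand-in, and the fetchWithRetries helper are hypothetical names for illustration, not part of Hudi.

import java.util.function.Supplier;

public final class BoundedRetry {

  // Unchecked stand-in for HoodieSourceTimeoutException (hypothetical).
  static final class SourceTimeoutException extends RuntimeException {
    SourceTimeoutException(String msg) {
      super(msg);
    }
  }

  // Calls fetch up to maxRetryCount times, sleeping retryIntervalSecs between
  // attempts; the final failure is rethrown, mirroring DeltaSync.readFromSource.
  static <T> T fetchWithRetries(Supplier<T> fetch, int maxRetryCount, long retryIntervalSecs) {
    int curRetryCount = 0;
    T result = null;
    while (curRetryCount++ < maxRetryCount && result == null) {
      try {
        result = fetch.get();
      } catch (SourceTimeoutException e) {
        if (curRetryCount >= maxRetryCount) {
          throw e; // attempt budget exhausted: surface the failure
        }
        try {
          Thread.sleep(retryIntervalSecs * 1000L);
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt(); // restore the interrupt flag
        }
      }
    }
    return result;
  }
}

Note that with cfg.retryOnSourceFailures disabled, maxRetryCount is forced to 1, so the first HoodieSourceTimeoutException propagates immediately.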
Use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.
The class AvroKafkaSource, method fetchNewData.
@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
  try {
    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
    if (totalNewMsgs <= 0) {
      // No new messages: return an empty batch but keep the checkpoint so the next run resumes correctly
      return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
    }
    JavaRDD<GenericRecord> newDataRDD = toRDD(offsetRanges);
    return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
  } catch (org.apache.kafka.common.errors.TimeoutException e) {
    // Translate the Kafka client timeout into Hudi's retryable source timeout
    throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage());
  }
}
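Two details here are easy to miss: HoodieSourceTimeoutException is unchecked (fetchNewData declares no throws clause), so it propagates through the source layers up to DeltaSync's retry loop without signature changes, and an empty fetch still returns the checkpoint string computed from offsetRanges, so the next round resumes from the same offsets. A small self-contained sketch of the second point, using a hypothetical Batch record in place of Hudi's InputBatch and an illustrative checkpoint string:

import java.util.Optional;

final class EmptyBatchSketch {

  // Hypothetical stand-in for Hudi's InputBatch: optional payload plus the checkpoint to persist.
  record Batch<T>(Optional<T> data, String checkpoint) {}

  static Batch<String> fetch(long totalNewMsgs, String checkpointFromOffsets) {
    if (totalNewMsgs <= 0) {
      // No data, but the checkpoint is still carried forward so the commit
      // records where the next fetch should resume.
      return new Batch<>(Optional.empty(), checkpointFromOffsets);
    }
    return new Batch<>(Optional.of(totalNewMsgs + " records"), checkpointFromOffsets);
  }

  public static void main(String[] args) {
    Batch<String> empty = fetch(0, "some-topic,0:42"); // illustrative checkpoint format, not Hudi's exact encoding
    System.out.println(empty.data().isPresent() + " -> " + empty.checkpoint()); // false -> some-topic,0:42
  }
}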
Use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.
The class JsonKafkaSource, method fetchNewData.
@Override
protected InputBatch<JavaRDD<String>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
  try {
    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
    if (totalNewMsgs <= 0) {
      // No new messages: return an empty batch but keep the checkpoint so the next run resumes correctly
      return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
    }
    JavaRDD<String> newDataRDD = toRDD(offsetRanges);
    return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
  } catch (org.apache.kafka.common.errors.TimeoutException e) {
    // Translate the Kafka client timeout into Hudi's retryable source timeout
    throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage());
  }
}
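JsonKafkaSource repeats AvroKafkaSource's try/catch verbatim apart from the record type. If you were writing another Kafka-backed source, the timeout translation could be factored into a shared helper; the sketch below uses a hypothetical fetchWithTimeoutTranslation method and is not part of Hudi's API.

import java.util.function.Supplier;

import org.apache.hudi.utilities.exception.HoodieSourceTimeoutException;
import org.apache.kafka.common.errors.TimeoutException;

final class KafkaFetchHelper {

  // Runs the fetch and rethrows Kafka client timeouts as Hudi's retryable source timeout.
  static <T> T fetchWithTimeoutTranslation(Supplier<T> fetch) {
    try {
      return fetch.get();
    } catch (TimeoutException e) {
      throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage());
    }
  }
}

Both fetchNewData implementations could then wrap their bodies in a single fetchWithTimeoutTranslation call, keeping the timeout-to-retry contract with DeltaSync in one place.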