Search in sources :

Example 1 with HoodieSourceTimeoutException

use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.

The method readFromSource of the class DeltaSync.

/**
 * Read from Upstream Source and apply transformation if needed.
 *
 * <p>The checkpoint to resume from is taken from the commit timeline when one is supplied; when no
 * timeline is supplied the table is initialized first. If no checkpoint was resolved that way and
 * {@code cfg.checkpoint} is set, {@code cfg.checkpoint} is used instead.
 *
 * <p>The fetch is attempted up to {@code cfg.maxRetryCount} times when
 * {@code cfg.retryOnSourceFailures} is enabled, otherwise exactly once. Only
 * {@link HoodieSourceTimeoutException} is retried, sleeping {@code cfg.retryIntervalSecs} seconds
 * between attempts. Note that a {@code null} result from {@code fetchFromSource} also leads to
 * another attempt (without sleeping) while attempts remain.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return {@code Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>>} Input data read from upstream
 *         source, consists of schemaProvider, checkpointStr and hoodieRecord; may be {@code null} if every
 *         attempt returned {@code null}
 * @throws IOException if one of the calls made here fails with an I/O error
 * @throws HoodieSourceTimeoutException if the last permitted attempt times out
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
    // Retrieve the previous round checkpoints, if any
    Option<String> resumeCheckpointStr = Option.empty();
    if (commitTimelineOpt.isPresent()) {
        resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
    } else {
        // initialize the table for the first time.
        String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
        HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(cfg.tableType)
            .setTableName(cfg.targetTableName)
            .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
            .setPayloadClassName(cfg.payloadClassName)
            .setBaseFileFormat(cfg.baseFileFormat)
            .setPartitionFields(partitionColumns)
            .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
            .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
            .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), SimpleKeyGenerator.class.getName()))
            .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
    }
    LOG.debug("Checkpoint from config: " + cfg.checkpoint);
    // The checkpoint supplied via config is only a fallback; a checkpoint resolved from the timeline wins.
    if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
        resumeCheckpointStr = Option.of(cfg.checkpoint);
    }
    LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);
    // With retries disabled the loop below still runs once.
    int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
    int curRetryCount = 0;
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
    while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
        try {
            sourceDataToSync = fetchFromSource(resumeCheckpointStr);
        } catch (HoodieSourceTimeoutException e) {
            // curRetryCount was already incremented by the loop condition, so this is the last attempt.
            if (curRetryCount >= maxRetryCount) {
                throw e;
            }
            LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause());
            LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount);
            try {
                // Multiply as long so a large interval cannot overflow int before reaching Thread.sleep.
                Thread.sleep(cfg.retryIntervalSecs * 1000L);
            } catch (InterruptedException ex) {
                LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
                // Keep retrying as before, but restore the interrupt status so callers can still observe it.
                // Any later sleep in this loop will then return immediately.
                Thread.currentThread().interrupt();
            }
        }
    }
    return sourceDataToSync;
}
Also used : HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) Configuration(org.apache.hadoop.conf.Configuration) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Pair(org.apache.hudi.common.util.collection.Pair)

Example 2 with HoodieSourceTimeoutException

use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.

The method fetchNewData of the class AvroKafkaSource.

/**
 * Fetches the next batch of Avro records from Kafka.
 *
 * <p>Offset ranges are computed from {@code lastCheckpointStr} and {@code sourceLimit}. When they
 * contain no new messages, an empty batch carrying the offsets as its checkpoint string is returned
 * without building an RDD.
 *
 * @param lastCheckpointStr checkpoint passed to the offset generator to compute the next offset ranges
 * @param sourceLimit limit passed to the offset generator
 * @return the batch of records (empty when there are no new messages) together with the checkpoint
 *         string derived from the computed offset ranges
 * @throws HoodieSourceTimeoutException if Kafka reports a timeout; the Kafka exception is kept as the cause
 */
@Override
protected InputBatch<JavaRDD<GenericRecord>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
        long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
        LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
        if (totalNewMsgs <= 0) {
            return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
        }
        JavaRDD<GenericRecord> newDataRDD = toRDD(offsetRanges);
        return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
    } catch (org.apache.kafka.common.errors.TimeoutException e) {
        // Chain the Kafka exception so its stack trace is not lost when the timeout is translated.
        throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage(), e);
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 3 with HoodieSourceTimeoutException

use of org.apache.hudi.utilities.exception.HoodieSourceTimeoutException in project hudi by apache.

The method fetchNewData of the class JsonKafkaSource.

/**
 * Fetches the next batch of JSON string records from Kafka.
 *
 * <p>Offset ranges are computed from {@code lastCheckpointStr} and {@code sourceLimit}. When they
 * contain no new messages, an empty batch carrying the offsets as its checkpoint string is returned
 * without building an RDD.
 *
 * @param lastCheckpointStr checkpoint passed to the offset generator to compute the next offset ranges
 * @param sourceLimit limit passed to the offset generator
 * @return the batch of records (empty when there are no new messages) together with the checkpoint
 *         string derived from the computed offset ranges
 * @throws HoodieSourceTimeoutException if Kafka reports a timeout; the Kafka exception is kept as the cause
 */
@Override
protected InputBatch<JavaRDD<String>> fetchNewData(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCheckpointStr, sourceLimit, metrics);
        long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
        LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
        if (totalNewMsgs <= 0) {
            return new InputBatch<>(Option.empty(), CheckpointUtils.offsetsToStr(offsetRanges));
        }
        JavaRDD<String> newDataRDD = toRDD(offsetRanges);
        return new InputBatch<>(Option.of(newDataRDD), CheckpointUtils.offsetsToStr(offsetRanges));
    } catch (org.apache.kafka.common.errors.TimeoutException e) {
        // Chain the Kafka exception so its stack trace is not lost when the timeout is translated.
        throw new HoodieSourceTimeoutException("Kafka Source timed out " + e.getMessage(), e);
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException)

Aggregations

HoodieSourceTimeoutException (org.apache.hudi.utilities.exception.HoodieSourceTimeoutException): 3, OffsetRange (org.apache.spark.streaming.kafka010.OffsetRange): 2, GenericRecord (org.apache.avro.generic.GenericRecord): 1, Configuration (org.apache.hadoop.conf.Configuration): 1, HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 1, Pair (org.apache.hudi.common.util.collection.Pair): 1, SimpleKeyGenerator (org.apache.hudi.keygen.SimpleKeyGenerator): 1, DelegatingSchemaProvider (org.apache.hudi.utilities.schema.DelegatingSchemaProvider): 1, SchemaProvider (org.apache.hudi.utilities.schema.SchemaProvider): 1