
Example 16 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class KafkaOffsetGen, the method getNextOffsetRanges:

public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
    // Obtain current metadata for the topic
    Map<TopicPartition, Long> fromOffsets;
    Map<TopicPartition, Long> toOffsets;
    try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
        if (!checkTopicExists(consumer)) {
            throw new HoodieException("Kafka topic:" + topicName + " does not exist");
        }
        List<PartitionInfo> partitionInfoList;
        partitionInfoList = consumer.partitionsFor(topicName);
        Set<TopicPartition> topicPartitions = partitionInfoList.stream().map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
        if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
            lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
        }
        // Determine the offset ranges to read from
        if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
            fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
            metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
        } else {
            switch(autoResetValue) {
                case EARLIEST:
                    fromOffsets = consumer.beginningOffsets(topicPartitions);
                    break;
                case LATEST:
                    fromOffsets = consumer.endOffsets(topicPartitions);
                    break;
                case GROUP:
                    fromOffsets = getGroupOffsets(consumer, topicPartitions);
                    break;
                default:
                    throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' or 'group' ");
            }
        }
        // Obtain the latest offsets.
        toOffsets = consumer.endOffsets(topicPartitions);
    }
    // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
    long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
    long numEvents;
    if (sourceLimit == Long.MAX_VALUE) {
        numEvents = maxEventsToReadFromKafka;
        LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
    } else {
        numEvents = sourceLimit;
    }
    if (numEvents < toOffsets.size()) {
        throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
    }
    return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
Also used : Arrays(java.util.Arrays) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) DataSourceUtils(org.apache.hudi.DataSourceUtils) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) Function(java.util.function.Function) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) Map(java.util.Map) AvroKafkaSource(org.apache.hudi.utilities.sources.AvroKafkaSource) HoodieDeltaStreamerException(org.apache.hudi.utilities.exception.HoodieDeltaStreamerException) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) TopicPartition(org.apache.kafka.common.TopicPartition) TimeoutException(org.apache.kafka.common.errors.TimeoutException) TypedProperties(org.apache.hudi.common.config.TypedProperties) Set(java.util.Set) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) PartitionInfo(org.apache.kafka.common.PartitionInfo) OffsetAndTimestamp(org.apache.kafka.clients.consumer.OffsetAndTimestamp) Collectors(java.util.stream.Collectors) List(java.util.List) ConfigProperty(org.apache.hudi.common.config.ConfigProperty) OffsetAndMetadata(org.apache.kafka.clients.consumer.OffsetAndMetadata) CommitFailedException(org.apache.kafka.clients.consumer.CommitFailedException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) KafkaConsumer(org.apache.kafka.clients.consumer.KafkaConsumer)
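
For context on the final CheckpointUtils.computeOffsetRanges call, the sketch below shows a simplified version of the budgeting step it performs: split the overall event budget across partitions and clamp each range to the latest available offset. It is not the actual Hudi implementation (the real one is more careful about redistributing unused budget); the OffsetRangeBudgetSketch class and budgetedRanges method are made up for illustration.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.apache.kafka.common.TopicPartition;
import org.apache.spark.streaming.kafka010.OffsetRange;

public class OffsetRangeBudgetSketch {

    // Simplified stand-in for CheckpointUtils.computeOffsetRanges: give every partition an
    // equal share of the event budget and never plan past the latest observed offset.
    static OffsetRange[] budgetedRanges(Map<TopicPartition, Long> fromOffsets,
                                        Map<TopicPartition, Long> toOffsets,
                                        long numEvents) {
        if (fromOffsets.isEmpty()) {
            return new OffsetRange[0];
        }
        long perPartition = Math.max(1L, numEvents / fromOffsets.size());
        List<OffsetRange> ranges = new ArrayList<>();
        for (Map.Entry<TopicPartition, Long> entry : fromOffsets.entrySet()) {
            TopicPartition tp = entry.getKey();
            long from = entry.getValue();
            long latest = toOffsets.getOrDefault(tp, from);
            // Clamp the planned range to what is actually available in the partition.
            long until = Math.min(latest, from + perPartition);
            ranges.add(OffsetRange.create(tp.topic(), tp.partition(), from, until));
        }
        return ranges.toArray(new OffsetRange[0]);
    }
}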

Example 17 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class S3EventsMetaSelector, the method createSourceSelector:

/**
 * Factory method for creating custom CloudObjectsMetaSelector. Default selector to use is {@link
 * S3EventsMetaSelector}
 */
public static S3EventsMetaSelector createSourceSelector(TypedProperties props) {
    String sourceSelectorClass = props.getString(S3EventsMetaSelector.Config.SOURCE_INPUT_SELECTOR, S3EventsMetaSelector.class.getName());
    try {
        S3EventsMetaSelector selector = (S3EventsMetaSelector) ReflectionUtils.loadClass(sourceSelectorClass, new Class<?>[] { TypedProperties.class }, props);
        log.info("Using path selector " + selector.getClass().getName());
        return selector;
    } catch (Exception e) {
        throw new HoodieException("Could not load source selector class " + sourceSelectorClass, e);
    }
}
Also used : HoodieException(org.apache.hudi.exception.HoodieException) TypedProperties(org.apache.hudi.common.config.TypedProperties) IOException(java.io.IOException) JSONException(org.json.JSONException)
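
The same pattern, stripped of Hudi specifics, is sketched below: load a class by name, invoke a single-argument constructor reflectively, and fold every failure into one domain exception, mirroring how createSourceSelector wraps problems in HoodieException. ReflectionUtils.loadClass does this (and more) inside Hudi; the SelectorLoadException type and load method here are hypothetical.

import java.lang.reflect.Constructor;
import java.util.Properties;

public class ReflectiveSelectorFactorySketch {

    // Hypothetical wrapper exception standing in for HoodieException.
    static class SelectorLoadException extends RuntimeException {
        SelectorLoadException(String message, Throwable cause) {
            super(message, cause);
        }
    }

    // Load a class by name, call its single Properties-argument constructor, and wrap
    // every failure mode in one unchecked domain exception.
    static <T> T load(String className, Class<T> expectedType, Properties ctorArg) {
        try {
            Class<?> clazz = Class.forName(className);
            Constructor<?> ctor = clazz.getConstructor(Properties.class);
            return expectedType.cast(ctor.newInstance(ctorArg));
        } catch (ReflectiveOperationException | ClassCastException e) {
            throw new SelectorLoadException("Could not load selector class " + className, e);
        }
    }
}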

Example 18 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class JdbcSource, the method checkpoint:

private String checkpoint(Dataset<Row> rowDataset, boolean isIncremental, Option<String> lastCkptStr) {
    try {
        if (isIncremental) {
            Column incrementalColumn = rowDataset.col(props.getString(Config.INCREMENTAL_COLUMN));
            final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first().getString(0);
            LOG.info(String.format("Checkpointing column %s with value: %s ", incrementalColumn, max));
            if (max != null) {
                return max;
            }
            return lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get()) ? lastCkptStr.get() : StringUtils.EMPTY_STRING;
        } else {
            return StringUtils.EMPTY_STRING;
        }
    } catch (Exception e) {
        LOG.error("Failed to checkpoint");
        throw new HoodieException("Failed to checkpoint. Last checkpoint: " + lastCkptStr.orElse(null), e);
    }
}
Also used : Column(org.apache.spark.sql.Column) HoodieException(org.apache.hudi.exception.HoodieException)
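
As a standalone illustration of the incremental-checkpoint step, the sketch below computes the max of a column cast to a string against a locally built Dataset, the same agg(...).first().getString(0) shape used in checkpoint() above. The local SparkSession and the updated_at column are assumptions made for the example only.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;
import org.apache.spark.sql.types.DataTypes;

public class IncrementalCheckpointSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("checkpoint-sketch")
                .master("local[1]")
                .getOrCreate();

        // Hypothetical incremental source; in JdbcSource this Dataset comes from the JDBC read.
        Dataset<Row> rows = spark.range(0, 10).toDF("updated_at");

        // Max of the incremental column, cast to a string, taken from the single aggregate row.
        String checkpoint = rows
                .agg(functions.max(rows.col("updated_at")).cast(DataTypes.StringType))
                .first()
                .getString(0);

        System.out.println("Next checkpoint: " + checkpoint);
        spark.stop();
    }
}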

Example 19 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class DebeziumSource, the method fetchNextBatch:

@Override
protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
    String overrideCheckpointStr = props.getString(OVERRIDE_CHECKPOINT_STRING, "");
    OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCkptStr, sourceLimit, metrics);
    long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
    LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
    if (totalNewMsgs == 0) {
        // No new events to read: return an empty dataframe, since the schema fetched from the registry
        // can only be considered up to date if a change event has occurred.
        return Pair.of(Option.of(sparkSession.emptyDataFrame()), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
    } else {
        try {
            String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
            Dataset<Row> dataset = toDataset(offsetRanges, offsetGen, schemaStr);
            LOG.info(String.format("Spark schema of Kafka Payload for topic %s:\n%s", offsetGen.getTopicName(), dataset.schema().treeString()));
            LOG.info(String.format("New checkpoint string: %s", CheckpointUtils.offsetsToStr(offsetRanges)));
            return Pair.of(Option.of(dataset), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
        } catch (IOException exc) {
            LOG.error("Fatal error reading and parsing incoming debezium event", exc);
            throw new HoodieException("Fatal error reading and parsing incoming debezium event", exc);
        }
    }
}
Also used : OffsetRange(org.apache.spark.streaming.kafka010.OffsetRange) HoodieException(org.apache.hudi.exception.HoodieException) Row(org.apache.spark.sql.Row) IOException(java.io.IOException)
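
The checkpoint-selection rule used in both branches above (an operator-supplied override wins, otherwise the consumed offset ranges are serialized) can be isolated as in the sketch below. The serializeOffsets helper is a simplified placeholder, not the actual CheckpointUtils.offsetsToStr format.

import java.util.Arrays;
import java.util.stream.Collectors;

import org.apache.spark.streaming.kafka010.OffsetRange;

public class CheckpointChoiceSketch {

    // A non-empty override checkpoint always wins over the offsets that were just consumed.
    static String nextCheckpoint(String overrideCheckpointStr, OffsetRange[] ranges) {
        return overrideCheckpointStr.isEmpty() ? serializeOffsets(ranges) : overrideCheckpointStr;
    }

    // Placeholder serialization ("topic,partition:untilOffset,..."); the real format lives in CheckpointUtils.
    static String serializeOffsets(OffsetRange[] ranges) {
        if (ranges.length == 0) {
            return "";
        }
        String body = Arrays.stream(ranges)
                .map(r -> r.partition() + ":" + r.untilOffset())
                .collect(Collectors.joining(","));
        return ranges[0].topic() + "," + body;
    }
}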

Example 20 with HoodieException

Use of org.apache.hudi.exception.HoodieException in project hudi by apache.

From the class HoodieMultiTableDeltaStreamer, the method populateTableExecutionContextList:

// commonProps are passed as parameter which contain table to config file mapping
private void populateTableExecutionContextList(TypedProperties properties, String configFolder, FileSystem fs, Config config) throws IOException {
    List<String> tablesToBeIngested = getTablesToBeIngested(properties);
    logger.info("tables to be ingested via MultiTableDeltaStreamer : " + tablesToBeIngested);
    TableExecutionContext executionContext;
    for (String table : tablesToBeIngested) {
        String[] tableWithDatabase = table.split("\\.");
        String database = tableWithDatabase.length > 1 ? tableWithDatabase[0] : "default";
        String currentTable = tableWithDatabase.length > 1 ? tableWithDatabase[1] : table;
        String configProp = Constants.INGESTION_PREFIX + database + Constants.DELIMITER + currentTable + Constants.INGESTION_CONFIG_SUFFIX;
        String configFilePath = properties.getString(configProp, Helpers.getDefaultConfigFilePath(configFolder, database, currentTable));
        checkIfTableConfigFileExists(configFolder, fs, configFilePath);
        TypedProperties tableProperties = UtilHelpers.readConfig(fs.getConf(), new Path(configFilePath), new ArrayList<String>()).getProps();
        properties.forEach((k, v) -> {
            if (tableProperties.get(k) == null) {
                tableProperties.setProperty(k.toString(), v.toString());
            }
        });
        final HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
        // copy all the values from config to cfg
        String targetBasePath = resetTarget(config, database, currentTable);
        Helpers.deepCopyConfigs(config, cfg);
        String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, "");
        cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? targetBasePath : overriddenTargetBasePath;
        if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) {
            throw new HoodieException("Meta sync table field not provided!");
        }
        populateSchemaProviderProps(cfg, tableProperties);
        executionContext = new TableExecutionContext();
        executionContext.setProperties(tableProperties);
        executionContext.setConfig(cfg);
        executionContext.setDatabase(database);
        executionContext.setTableName(currentTable);
        this.tableExecutionContexts.add(executionContext);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) TypedProperties(org.apache.hudi.common.config.TypedProperties)
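
The property-merge rule in the loop above (global properties fill in only the keys the per-table config file did not set itself, so table-level values always win) is sketched below with plain java.util.Properties standing in for TypedProperties; the config keys in main are purely illustrative.

import java.util.Properties;

public class TablePropertyMergeSketch {

    // Copy the table-level properties first, then add global properties only for keys
    // the table config did not define, so per-table overrides are preserved.
    static Properties mergeGlobalsIntoTable(Properties global, Properties table) {
        Properties merged = new Properties();
        merged.putAll(table);
        global.forEach((k, v) -> {
            if (merged.get(k) == null) {
                merged.setProperty(k.toString(), v.toString());
            }
        });
        return merged;
    }

    public static void main(String[] args) {
        Properties global = new Properties();
        global.setProperty("hoodie.datasource.write.operation", "upsert");
        global.setProperty("hoodie.deltastreamer.source.dfs.root", "/data/global");

        Properties table = new Properties();
        table.setProperty("hoodie.deltastreamer.source.dfs.root", "/data/table_a");

        Properties merged = mergeGlobalsIntoTable(global, table);
        // Prints /data/table_a: the table-level override survives the merge.
        System.out.println(merged.getProperty("hoodie.deltastreamer.source.dfs.root"));
    }
}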

Aggregations

HoodieException (org.apache.hudi.exception.HoodieException): 171 usages
IOException (java.io.IOException): 87 usages
Path (org.apache.hadoop.fs.Path): 45 usages
Schema (org.apache.avro.Schema): 35 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 35 usages
List (java.util.List): 30 usages
ArrayList (java.util.ArrayList): 27 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23 usages
Collectors (java.util.stream.Collectors): 21 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 19 usages
Option (org.apache.hudi.common.util.Option): 19 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 18 usages
Map (java.util.Map): 16 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 16 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 15 usages
Arrays (java.util.Arrays): 14 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 14 usages
Logger (org.apache.log4j.Logger): 14 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 13 usages
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 13 usages