use of org.apache.hudi.exception.HoodieException in project hudi by apache.
the class KafkaOffsetGen, method getNextOffsetRanges.
public OffsetRange[] getNextOffsetRanges(Option<String> lastCheckpointStr, long sourceLimit, HoodieDeltaStreamerMetrics metrics) {
  // Obtain current metadata for the topic
  Map<TopicPartition, Long> fromOffsets;
  Map<TopicPartition, Long> toOffsets;
  try (KafkaConsumer consumer = new KafkaConsumer(kafkaParams)) {
    if (!checkTopicExists(consumer)) {
      throw new HoodieException("Kafka topic:" + topicName + " does not exist");
    }
    List<PartitionInfo> partitionInfoList;
    partitionInfoList = consumer.partitionsFor(topicName);
    Set<TopicPartition> topicPartitions = partitionInfoList.stream().map(x -> new TopicPartition(x.topic(), x.partition())).collect(Collectors.toSet());
    if (Config.KAFKA_CHECKPOINT_TYPE_TIMESTAMP.equals(kafkaCheckpointType) && isValidTimestampCheckpointType(lastCheckpointStr)) {
      lastCheckpointStr = getOffsetsByTimestamp(consumer, partitionInfoList, topicPartitions, topicName, Long.parseLong(lastCheckpointStr.get()));
    }
    // Determine the offset ranges to read from
    if (lastCheckpointStr.isPresent() && !lastCheckpointStr.get().isEmpty() && checkTopicCheckpoint(lastCheckpointStr)) {
      fromOffsets = fetchValidOffsets(consumer, lastCheckpointStr, topicPartitions);
      metrics.updateDeltaStreamerKafkaDelayCountMetrics(delayOffsetCalculation(lastCheckpointStr, topicPartitions, consumer));
    } else {
      switch (autoResetValue) {
        case EARLIEST:
          fromOffsets = consumer.beginningOffsets(topicPartitions);
          break;
        case LATEST:
          fromOffsets = consumer.endOffsets(topicPartitions);
          break;
        case GROUP:
          fromOffsets = getGroupOffsets(consumer, topicPartitions);
          break;
        default:
          throw new HoodieNotSupportedException("Auto reset value must be one of 'earliest' or 'latest' or 'group' ");
      }
    }
    // Obtain the latest offsets.
    toOffsets = consumer.endOffsets(topicPartitions);
  }
  // Come up with final set of OffsetRanges to read (account for new partitions, limit number of events)
  long maxEventsToReadFromKafka = props.getLong(Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.key(), Config.MAX_EVENTS_FROM_KAFKA_SOURCE_PROP.defaultValue());
  long numEvents;
  if (sourceLimit == Long.MAX_VALUE) {
    numEvents = maxEventsToReadFromKafka;
    LOG.info("SourceLimit not configured, set numEvents to default value : " + maxEventsToReadFromKafka);
  } else {
    numEvents = sourceLimit;
  }
  if (numEvents < toOffsets.size()) {
    throw new HoodieException("sourceLimit should not be less than the number of kafka partitions");
  }
  return CheckpointUtils.computeOffsetRanges(fromOffsets, toOffsets, numEvents);
}
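A hedged sketch of how a caller might exercise this method and handle the HoodieException it throws when the topic is missing or sourceLimit is smaller than the partition count. The topic name, the non-standard property key, and the in-scope metrics instance are assumptions rather than values from the snippet above, and KafkaOffsetGen is assumed to expose a TypedProperties constructor.

TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.kafka.topic", "impressions"); // assumed key and topic
props.setProperty("bootstrap.servers", "localhost:9092");                    // standard Kafka client setting
props.setProperty("auto.offset.reset", "earliest");                          // standard Kafka client setting

KafkaOffsetGen offsetGen = new KafkaOffsetGen(props);
try {
  // `metrics` is assumed to be an existing HoodieDeltaStreamerMetrics instance in scope.
  OffsetRange[] ranges = offsetGen.getNextOffsetRanges(Option.empty(), 5000000L, metrics);
  long newMessages = CheckpointUtils.totalNewMessages(ranges);
} catch (HoodieException e) {
  // Thrown when the topic does not exist or sourceLimit < number of Kafka partitions.
}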
use of org.apache.hudi.exception.HoodieException in project hudi by apache.
the class S3EventsMetaSelector, method createSourceSelector.
/**
 * Factory method for creating custom CloudObjectsMetaSelector. Default selector to use is {@link
 * S3EventsMetaSelector}
 */
public static S3EventsMetaSelector createSourceSelector(TypedProperties props) {
  String sourceSelectorClass = props.getString(S3EventsMetaSelector.Config.SOURCE_INPUT_SELECTOR, S3EventsMetaSelector.class.getName());
  try {
    S3EventsMetaSelector selector = (S3EventsMetaSelector) ReflectionUtils.loadClass(sourceSelectorClass, new Class<?>[] { TypedProperties.class }, props);
    log.info("Using path selector " + selector.getClass().getName());
    return selector;
  } catch (Exception e) {
    throw new HoodieException("Could not load source selector class " + sourceSelectorClass, e);
  }
}
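A hedged usage sketch: point the selector property at a custom subclass and let createSourceSelector load it reflectively; the class name com.example.CustomS3EventsMetaSelector is hypothetical.

TypedProperties props = new TypedProperties();
// Hypothetical subclass of S3EventsMetaSelector that declares a (TypedProperties) constructor.
props.setProperty(S3EventsMetaSelector.Config.SOURCE_INPUT_SELECTOR, "com.example.CustomS3EventsMetaSelector");
try {
  S3EventsMetaSelector selector = S3EventsMetaSelector.createSourceSelector(props);
} catch (HoodieException e) {
  // Raised when the class cannot be found, lacks the expected constructor, or is not an S3EventsMetaSelector.
}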
use of org.apache.hudi.exception.HoodieException in project hudi by apache.
the class JdbcSource, method checkpoint.
private String checkpoint(Dataset<Row> rowDataset, boolean isIncremental, Option<String> lastCkptStr) {
  try {
    if (isIncremental) {
      Column incrementalColumn = rowDataset.col(props.getString(Config.INCREMENTAL_COLUMN));
      final String max = rowDataset.agg(functions.max(incrementalColumn).cast(DataTypes.StringType)).first().getString(0);
      LOG.info(String.format("Checkpointing column %s with value: %s ", incrementalColumn, max));
      if (max != null) {
        return max;
      }
      return lastCkptStr.isPresent() && !StringUtils.isNullOrEmpty(lastCkptStr.get()) ? lastCkptStr.get() : StringUtils.EMPTY_STRING;
    } else {
      return StringUtils.EMPTY_STRING;
    }
  } catch (Exception e) {
    LOG.error("Failed to checkpoint");
    throw new HoodieException("Failed to checkpoint. Last checkpoint: " + lastCkptStr.orElse(null), e);
  }
}
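The heart of the incremental path is the max-of-column aggregation cast to a string. A minimal standalone sketch of that pattern, assuming a SparkSession named spark and a source table with an updated_at column (both hypothetical):

Dataset<Row> rows = spark.table("source_table"); // hypothetical source
String newCheckpoint = rows
    .agg(functions.max(rows.col("updated_at")).cast(DataTypes.StringType))
    .first()
    .getString(0); // null when the dataset is empty, in which case the previous checkpoint would be reused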
use of org.apache.hudi.exception.HoodieException in project hudi by apache.
the class DebeziumSource, method fetchNextBatch.
@Override
protected Pair<Option<Dataset<Row>>, String> fetchNextBatch(Option<String> lastCkptStr, long sourceLimit) {
  String overrideCheckpointStr = props.getString(OVERRIDE_CHECKPOINT_STRING, "");
  OffsetRange[] offsetRanges = offsetGen.getNextOffsetRanges(lastCkptStr, sourceLimit, metrics);
  long totalNewMsgs = CheckpointUtils.totalNewMessages(offsetRanges);
  LOG.info("About to read " + totalNewMsgs + " from Kafka for topic :" + offsetGen.getTopicName());
  if (totalNewMsgs == 0) {
    // No new change events since the last checkpoint; return an empty dataframe and carry the checkpoint forward.
    return Pair.of(Option.of(sparkSession.emptyDataFrame()), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
  } else {
    try {
      String schemaStr = schemaRegistryProvider.fetchSchemaFromRegistry(props.getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
      Dataset<Row> dataset = toDataset(offsetRanges, offsetGen, schemaStr);
      LOG.info(String.format("Spark schema of Kafka Payload for topic %s:\n%s", offsetGen.getTopicName(), dataset.schema().treeString()));
      LOG.info(String.format("New checkpoint string: %s", CheckpointUtils.offsetsToStr(offsetRanges)));
      return Pair.of(Option.of(dataset), overrideCheckpointStr.isEmpty() ? CheckpointUtils.offsetsToStr(offsetRanges) : overrideCheckpointStr);
    } catch (IOException exc) {
      LOG.error("Fatal error reading and parsing incoming debezium event", exc);
      throw new HoodieException("Fatal error reading and parsing incoming debezium event", exc);
    }
  }
}
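A hedged configuration sketch for this path: the schema registry key comes from the constant referenced above, while the URL value, the checkpoint-override usage, and its format are assumptions.

TypedProperties props = new TypedProperties();
// Endpoint the schemaRegistryProvider is queried with; the URL is a placeholder.
props.setProperty(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP,
    "http://localhost:8081/subjects/dbserver1.inventory.customers-value/versions/latest");
// Optionally force a fixed checkpoint instead of the computed offset ranges
// (OVERRIDE_CHECKPOINT_STRING is the constant read at the top of fetchNextBatch; the value format here is assumed):
// props.setProperty(OVERRIDE_CHECKPOINT_STRING, "dbserver1.inventory.customers,0:42");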
use of org.apache.hudi.exception.HoodieException in project hudi by apache.
the class HoodieMultiTableDeltaStreamer, method populateTableExecutionContextList.
// commonProps are passed in as a parameter and contain the table-to-config-file mapping
private void populateTableExecutionContextList(TypedProperties properties, String configFolder, FileSystem fs, Config config) throws IOException {
  List<String> tablesToBeIngested = getTablesToBeIngested(properties);
  logger.info("tables to be ingested via MultiTableDeltaStreamer : " + tablesToBeIngested);
  TableExecutionContext executionContext;
  for (String table : tablesToBeIngested) {
    String[] tableWithDatabase = table.split("\\.");
    String database = tableWithDatabase.length > 1 ? tableWithDatabase[0] : "default";
    String currentTable = tableWithDatabase.length > 1 ? tableWithDatabase[1] : table;
    String configProp = Constants.INGESTION_PREFIX + database + Constants.DELIMITER + currentTable + Constants.INGESTION_CONFIG_SUFFIX;
    String configFilePath = properties.getString(configProp, Helpers.getDefaultConfigFilePath(configFolder, database, currentTable));
    checkIfTableConfigFileExists(configFolder, fs, configFilePath);
    TypedProperties tableProperties = UtilHelpers.readConfig(fs.getConf(), new Path(configFilePath), new ArrayList<String>()).getProps();
    properties.forEach((k, v) -> {
      if (tableProperties.get(k) == null) {
        tableProperties.setProperty(k.toString(), v.toString());
      }
    });
    final HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
    // copy all the values from config to cfg
    String targetBasePath = resetTarget(config, database, currentTable);
    Helpers.deepCopyConfigs(config, cfg);
    String overriddenTargetBasePath = tableProperties.getString(Constants.TARGET_BASE_PATH_PROP, "");
    cfg.targetBasePath = StringUtils.isNullOrEmpty(overriddenTargetBasePath) ? targetBasePath : overriddenTargetBasePath;
    if (cfg.enableMetaSync && StringUtils.isNullOrEmpty(tableProperties.getString(DataSourceWriteOptions.HIVE_TABLE().key(), ""))) {
      throw new HoodieException("Meta sync table field not provided!");
    }
    populateSchemaProviderProps(cfg, tableProperties);
    executionContext = new TableExecutionContext();
    executionContext.setProperties(tableProperties);
    executionContext.setConfig(cfg);
    executionContext.setDatabase(database);
    executionContext.setTableName(currentTable);
    this.tableExecutionContexts.add(executionContext);
  }
}
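A short sketch of the per-table config lookup this loop performs; the concrete prefix, delimiter, suffix, and default file name are assumptions standing in for the Constants and Helpers referenced above, and properties/configFolder stand for the method's own arguments.

// Assumed values; the real ones come from HoodieMultiTableDeltaStreamer's Constants.
String ingestionPrefix = "hoodie.deltastreamer.ingestion.";
String delimiter = ".";
String configSuffix = ".configFile";

String database = "db1";
String currentTable = "table1";

// Property that can point db1.table1 at an explicit config file ...
String configProp = ingestionPrefix + database + delimiter + currentTable + configSuffix;
// ... otherwise a default path under configFolder is used (file naming assumed here):
String defaultConfigFilePath = configFolder + "/" + database + "_" + currentTable + "_config.properties";
String configFilePath = properties.getString(configProp, defaultConfigFilePath);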