Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.
The class DeltaSync, method readFromSource.
/**
 * Read from the upstream source and apply transformations if needed.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> Input data read from the upstream source,
 *         consisting of the schema provider, checkpoint string and RDD of HoodieRecords
 * @throws IOException if reading from the source or initializing the table fails
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
  // Retrieve the previous round checkpoints, if any
  Option<String> resumeCheckpointStr = Option.empty();
  if (commitTimelineOpt.isPresent()) {
    resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
  } else {
    // initialize the table for the first time.
    String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
    HoodieTableMetaClient.withPropertyBuilder()
        .setTableType(cfg.tableType)
        .setTableName(cfg.targetTableName)
        .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
        .setPayloadClassName(cfg.payloadClassName)
        .setBaseFileFormat(cfg.baseFileFormat)
        .setPartitionFields(partitionColumns)
        .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
        .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(),
            HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
        .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            SimpleKeyGenerator.class.getName()))
        .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
  }
  LOG.debug("Checkpoint from config: " + cfg.checkpoint);
  if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
    resumeCheckpointStr = Option.of(cfg.checkpoint);
  }
  LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);
  int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
  int curRetryCount = 0;
  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
  while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
    try {
      sourceDataToSync = fetchFromSource(resumeCheckpointStr);
    } catch (HoodieSourceTimeoutException e) {
      if (curRetryCount >= maxRetryCount) {
        throw e;
      }
      try {
        LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause());
        LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount);
        Thread.sleep(cfg.retryIntervalSecs * 1000);
      } catch (InterruptedException ex) {
        LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
      }
    }
  }
  return sourceDataToSync;
}
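The schema provider returned in the left half of this pair is whatever concrete implementation was configured for the source. For context, a custom provider only needs to extend the abstract SchemaProvider class and return an Avro schema from getSourceSchema(); getTargetSchema() falls back to the source schema by default. A minimal sketch, assuming the (TypedProperties, JavaSparkContext) constructor and a hard-coded schema string; the class name and schema are illustrative, not taken from the snippet above:

import org.apache.avro.Schema;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;

// Parses a fixed Avro schema; a real provider would typically read it from
// a file, a schema registry, or the target table itself.
public class InlineSchemaProvider extends SchemaProvider {

  private static final String SCHEMA_JSON =
      "{\"type\":\"record\",\"name\":\"test\",\"fields\":"
          + "[{\"name\":\"testString\",\"type\":\"string\"}]}";

  public InlineSchemaProvider(TypedProperties props, JavaSparkContext jssc) {
    super(props, jssc);
  }

  @Override
  public Schema getSourceSchema() {
    return new Schema.Parser().parse(SCHEMA_JSON);
  }
}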
Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.
The class DeltaSync, method syncOnce.
/**
* Run one round of delta sync and return the new compaction instant if one got scheduled.
*/
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
  Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
  Timer.Context overallTimerContext = metrics.getOverallTimerContext();
  // Refresh Timeline
  refreshTimeline();
  Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
  if (null != srcRecordsWithCkpt) {
    // This is the first input batch. If schemaProvider is not set, use it, register the Avro schema
    // and start the compactor.
    if (null == writeClient) {
      this.schemaProvider = srcRecordsWithCkpt.getKey();
      // Setup HoodieWriteClient and compaction now that we decided on schema
      setupWriteClient();
    } else {
      Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
      Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
      if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
        LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true) + ", Target :" + newTargetSchema.toString(true));
        // We need to recreate write client with new schema and register them.
        reInitWriteClient(newSourceSchema, newTargetSchema);
        processedSchema.addSchema(newSourceSchema);
        processedSchema.addSchema(newTargetSchema);
      }
    }
    // complete the pending clustering before writing to sink
    if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
      Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
      if (pendingClusteringInstant.isPresent()) {
        writeClient.cluster(pendingClusteringInstant.get(), true);
      }
    }
    result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
  }
  metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());
  // Clear persistent RDDs
  jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
  return result;
}
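In continuous mode, HoodieDeltaStreamer's sync service essentially drives this method in a loop and hands any newly scheduled compaction instant (the left element of the returned pair) to an async compactor. A simplified sketch of such a driver loop; deltaSync, shutdownRequested and enqueuePendingCompaction are placeholders for illustration, not the actual service code:

// Run delta sync rounds back to back and pick up any compaction
// instant that a round scheduled along the way.
while (!shutdownRequested) {
  Pair<Option<String>, JavaRDD<WriteStatus>> syncResult = deltaSync.syncOnce();
  if (syncResult != null && syncResult.getLeft().isPresent()) {
    // A compaction was scheduled during this round; queue it for async execution.
    enqueuePendingCompaction(syncResult.getLeft().get());
  }
}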
Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.
The class TestSchemaPostProcessor, method testPostProcessor.
@Test
public void testPostProcessor() throws IOException {
  properties.put(Config.SCHEMA_POST_PROCESSOR_PROP, DummySchemaPostProcessor.class.getName());
  SchemaProvider provider = UtilHelpers.wrapSchemaProviderWithPostProcessor(
      UtilHelpers.createSchemaProvider(DummySchemaProvider.class.getName(), properties, jsc), properties, jsc, null);
  Schema schema = provider.getSourceSchema();
  assertEquals(schema.getType(), Type.RECORD);
  assertEquals(schema.getName(), "test");
  assertNotNull(schema.getField("testString"));
}
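The wrapping applied by UtilHelpers.wrapSchemaProviderWithPostProcessor hands the provider's schema to the configured post processor before anything downstream sees it. A post processor is a class extending SchemaPostProcessor that rewrites the schema; below is a minimal sketch in the spirit of the DummySchemaPostProcessor used above, assuming the abstract processSchema(Schema) hook and a (TypedProperties, JavaSparkContext) constructor:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
import org.apache.spark.api.java.JavaSparkContext;

// Ignores the incoming schema and returns a fixed single-field record,
// matching the shape the assertions in the test above check for.
public class ReplaceSchemaPostProcessor extends SchemaPostProcessor {

  public ReplaceSchemaPostProcessor(TypedProperties props, JavaSparkContext jssc) {
    super(props, jssc);
  }

  @Override
  public Schema processSchema(Schema schema) {
    return SchemaBuilder.record("test").fields()
        .requiredString("testString")
        .endRecord();
  }
}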
Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.
The class TestSchemaPostProcessor, method testSparkAvro.
@Test
public void testSparkAvro() throws IOException {
  properties.put(Config.SCHEMA_POST_PROCESSOR_PROP, SparkAvroPostProcessor.class.getName());
  List<String> transformerClassNames = new ArrayList<>();
  transformerClassNames.add(FlatteningTransformer.class.getName());
  SchemaProvider provider = UtilHelpers.wrapSchemaProviderWithPostProcessor(
      UtilHelpers.createSchemaProvider(SparkAvroSchemaProvider.class.getName(), properties, jsc), properties, jsc, transformerClassNames);
  Schema schema = provider.getSourceSchema();
  assertEquals(schema.getType(), Type.RECORD);
  assertEquals(schema.getName(), "hoodie_source");
  assertEquals(schema.getNamespace(), "hoodie.source");
  assertNotNull(schema.getField("day"));
}
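Passing the transformer class names matters here because row-level transformers such as FlatteningTransformer operate on Spark Datasets rather than Avro records, so the schema is round-tripped through spark-avro, which is where the "hoodie_source" name and "hoodie.source" namespace asserted above come from. For context, a custom transformer implements Hudi's Transformer interface; a minimal sketch assuming its apply(JavaSparkContext, SparkSession, Dataset<Row>, TypedProperties) signature, with the column name purely illustrative:

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.transform.Transformer;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.functions;

// Filters out rows whose "day" column is null before they reach the writer.
public class DropNullDayTransformer implements Transformer {

  @Override
  public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
      Dataset<Row> rowDataset, TypedProperties properties) {
    return rowDataset.filter(functions.col("day").isNotNull());
  }
}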
Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.
The class TestAbstractDebeziumSource, method testDebeziumEvents.
@ParameterizedTest
@MethodSource("testArguments")
public void testDebeziumEvents(Operation operation) throws Exception {
  String sourceClass = getSourceClass();
  // topic setup.
  testUtils.createTopic(TEST_TOPIC_NAME, 2);
  TypedProperties props = createPropsForJsonSource();
  SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
  SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));
  testUtils.sendMessages(TEST_TOPIC_NAME, new String[] {generateDebeziumEvent(operation).toString()});
  InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
  assertEquals(1, fetch.getBatch().get().count());
  // Ensure the before fields are picked for DELETE CDC Events,
  // and after fields are picked for INSERT and UPDATE CDC Events.
  final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
  assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
  // Validate DB specific meta fields
  validateMetaFields(fetch.getBatch().get());
}
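createPropsForJsonSource() is not shown in this excerpt; for a Kafka-backed Debezium source it would typically carry the topic name, Kafka consumer settings, and the schema registry URL used by the schema provider. A hedged sketch of such properties follows; the exact keys the test sets are an assumption, and the localhost addresses stand in for the embedded test broker and registry:

TypedProperties props = new TypedProperties();
// Kafka consumer settings (the real test points these at its embedded broker).
props.setProperty("bootstrap.servers", "localhost:9092");
props.setProperty("auto.offset.reset", "earliest");
// Topic the generated Debezium events are published to.
props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
// Schema registry URL consulted by the (mocked) schema registry provider.
props.setProperty("hoodie.deltastreamer.schemaprovider.registry.url",
    "http://localhost:8081/subjects/test/versions/latest");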