Example 1 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class DeltaSync, method readFromSource.

/**
 * Read from the upstream source and apply transformations if needed.
 *
 * @param commitTimelineOpt Timeline with completed commits
 * @return Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> input data read from the upstream source,
 * consisting of the schema provider, the checkpoint string, and the Hoodie records
 * @throws IOException in case of any I/O error while reading from the source or initializing the table
 */
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> readFromSource(Option<HoodieTimeline> commitTimelineOpt) throws IOException {
    // Retrieve the previous round checkpoints, if any
    Option<String> resumeCheckpointStr = Option.empty();
    if (commitTimelineOpt.isPresent()) {
        resumeCheckpointStr = getCheckpointToResume(commitTimelineOpt);
    } else {
        // initialize the table for the first time.
        String partitionColumns = HoodieSparkUtils.getPartitionColumns(keyGenerator, props);
        HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(cfg.tableType)
            .setTableName(cfg.targetTableName)
            .setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
            .setPayloadClassName(cfg.payloadClassName)
            .setBaseFileFormat(cfg.baseFileFormat)
            .setPartitionFields(partitionColumns)
            .setRecordKeyFields(props.getProperty(DataSourceWriteOptions.RECORDKEY_FIELD().key()))
            .setPopulateMetaFields(props.getBoolean(HoodieTableConfig.POPULATE_META_FIELDS.key(), HoodieTableConfig.POPULATE_META_FIELDS.defaultValue()))
            .setKeyGeneratorClassProp(props.getProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), SimpleKeyGenerator.class.getName()))
            .initTable(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath);
    }
    LOG.debug("Checkpoint from config: " + cfg.checkpoint);
    if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
        resumeCheckpointStr = Option.of(cfg.checkpoint);
    }
    LOG.info("Checkpoint to resume from : " + resumeCheckpointStr);
    int maxRetryCount = cfg.retryOnSourceFailures ? cfg.maxRetryCount : 1;
    int curRetryCount = 0;
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> sourceDataToSync = null;
    while (curRetryCount++ < maxRetryCount && sourceDataToSync == null) {
        try {
            sourceDataToSync = fetchFromSource(resumeCheckpointStr);
        } catch (HoodieSourceTimeoutException e) {
            if (curRetryCount >= maxRetryCount) {
                throw e;
            }
            try {
                LOG.error("Exception thrown while fetching data from source. Msg : " + e.getMessage() + ", class : " + e.getClass() + ", cause : " + e.getCause());
                LOG.error("Sleeping for " + (cfg.retryIntervalSecs) + " before retrying again. Current retry count " + curRetryCount + ", max retry count " + cfg.maxRetryCount);
                Thread.sleep(cfg.retryIntervalSecs * 1000);
            } catch (InterruptedException ex) {
                LOG.error("Ignoring InterruptedException while waiting to retry on source failure " + e.getMessage());
            }
        }
    }
    return sourceDataToSync;
}
Also used : HoodieSourceTimeoutException(org.apache.hudi.utilities.exception.HoodieSourceTimeoutException) Configuration(org.apache.hadoop.conf.Configuration) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Pair(org.apache.hudi.common.util.collection.Pair)
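
For orientation, a caller would unpack the returned Pair as in the hypothetical sketch below. The deltaSync, commitTimelineOpt and writeBatch names are illustrative assumptions, not Hudi API; the accessors follow the return type documented above.

// Hypothetical caller of readFromSource: unpack the schema provider,
// checkpoint string and records, then hand them to the write path.
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> batch = deltaSync.readFromSource(commitTimelineOpt);
if (batch != null) {
    SchemaProvider schemaProvider = batch.getLeft();
    String checkpointStr = batch.getRight().getLeft();
    JavaRDD<HoodieRecord> records = batch.getRight().getRight();
    writeBatch(schemaProvider, checkpointStr, records); // assumed downstream step
}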

Example 2 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class DeltaSync, method syncOnce.

/**
 * Run one round of delta sync and return new compaction instant if one got scheduled.
 */
public Pair<Option<String>, JavaRDD<WriteStatus>> syncOnce() throws IOException {
    Pair<Option<String>, JavaRDD<WriteStatus>> result = null;
    Timer.Context overallTimerContext = metrics.getOverallTimerContext();
    // Refresh Timeline
    refreshTimeline();
    Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> srcRecordsWithCkpt = readFromSource(commitTimelineOpt);
    if (null != srcRecordsWithCkpt) {
        // First batch: no write client yet, so adopt the incoming schema provider and start the write client and compactor
        if (null == writeClient) {
            this.schemaProvider = srcRecordsWithCkpt.getKey();
            // Set up HoodieWriteClient and compaction now that we have decided on the schema
            setupWriteClient();
        } else {
            Schema newSourceSchema = srcRecordsWithCkpt.getKey().getSourceSchema();
            Schema newTargetSchema = srcRecordsWithCkpt.getKey().getTargetSchema();
            if (!(processedSchema.isSchemaPresent(newSourceSchema)) || !(processedSchema.isSchemaPresent(newTargetSchema))) {
                LOG.info("Seeing new schema. Source :" + newSourceSchema.toString(true) + ", Target :" + newTargetSchema.toString(true));
                // We need to recreate write client with new schema and register them.
                reInitWriteClient(newSourceSchema, newTargetSchema);
                processedSchema.addSchema(newSourceSchema);
                processedSchema.addSchema(newTargetSchema);
            }
        }
        // complete the pending clustering before writing to sink
        if (cfg.retryLastPendingInlineClusteringJob && getHoodieClientConfig(this.schemaProvider).inlineClusteringEnabled()) {
            Option<String> pendingClusteringInstant = getLastPendingClusteringInstant(allCommitsTimelineOpt);
            if (pendingClusteringInstant.isPresent()) {
                writeClient.cluster(pendingClusteringInstant.get(), true);
            }
        }
        result = writeToSink(srcRecordsWithCkpt.getRight().getRight(), srcRecordsWithCkpt.getRight().getLeft(), metrics, overallTimerContext);
    }
    metrics.updateDeltaStreamerSyncMetrics(System.currentTimeMillis());
    // Clear persistent RDDs
    jssc.getPersistentRDDs().values().forEach(JavaRDD::unpersist);
    return result;
}
Also used : Timer(com.codahale.metrics.Timer) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) DelegatingSchemaProvider(org.apache.hudi.utilities.schema.DelegatingSchemaProvider) Option(org.apache.hudi.common.util.Option) JavaRDD(org.apache.spark.api.java.JavaRDD) Pair(org.apache.hudi.common.util.collection.Pair)
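
Because syncOnce() reports any newly scheduled compaction instant, a driver can chain rounds together. Below is a minimal sketch of such a loop; the shutdownRequested flag and onCompactionScheduled hook are purely illustrative, though HoodieDeltaStreamer's continuous mode works along these lines.

// Hypothetical continuous-mode driver around syncOnce().
while (!shutdownRequested) {
    Pair<Option<String>, JavaRDD<WriteStatus>> result = deltaSync.syncOnce();
    if (result != null && result.getLeft().isPresent()) {
        // A compaction instant was scheduled during this round.
        onCompactionScheduled(result.getLeft().get());
    }
}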

Example 3 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestSchemaPostProcessor, method testPostProcessor.

@Test
public void testPostProcessor() throws IOException {
    properties.put(Config.SCHEMA_POST_PROCESSOR_PROP, DummySchemaPostProcessor.class.getName());
    SchemaProvider provider = UtilHelpers.wrapSchemaProviderWithPostProcessor(UtilHelpers.createSchemaProvider(DummySchemaProvider.class.getName(), properties, jsc), properties, jsc, null);
    Schema schema = provider.getSourceSchema();
    assertEquals(Type.RECORD, schema.getType());
    assertEquals("test", schema.getName());
    assertNotNull(schema.getField("testString"));
}
Also used : Schema(org.apache.avro.Schema) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) Test(org.junit.jupiter.api.Test)
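
The Dummy* fixtures used by this test are not shown on this page. A minimal post processor consistent with the assertions could look like the sketch below; it assumes SchemaPostProcessor exposes a (TypedProperties, JavaSparkContext) constructor and a single processSchema(Schema) hook, and the FixedSchemaPostProcessor name is hypothetical.

import org.apache.avro.Schema;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.schema.SchemaPostProcessor;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical stand-in for a schema post processor: it ignores the
// incoming schema and substitutes the fixed record/testString schema
// the assertions above look for.
public class FixedSchemaPostProcessor extends SchemaPostProcessor {

    public FixedSchemaPostProcessor(TypedProperties props, JavaSparkContext jssc) {
        super(props, jssc);
    }

    @Override
    public Schema processSchema(Schema schema) {
        return new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"test\",\"fields\":"
                + "[{\"name\":\"testString\",\"type\":\"string\"}]}");
    }
}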

Example 4 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestSchemaPostProcessor, method testSparkAvro.

@Test
public void testSparkAvro() throws IOException {
    properties.put(Config.SCHEMA_POST_PROCESSOR_PROP, SparkAvroPostProcessor.class.getName());
    List<String> transformerClassNames = new ArrayList<>();
    transformerClassNames.add(FlatteningTransformer.class.getName());
    SchemaProvider provider = UtilHelpers.wrapSchemaProviderWithPostProcessor(UtilHelpers.createSchemaProvider(SparkAvroSchemaProvider.class.getName(), properties, jsc), properties, jsc, transformerClassNames);
    Schema schema = provider.getSourceSchema();
    assertEquals(Type.RECORD, schema.getType());
    assertEquals("hoodie_source", schema.getName());
    assertEquals("hoodie.source", schema.getNamespace());
    assertNotNull(schema.getField("day"));
}
Also used : FlatteningTransformer(org.apache.hudi.utilities.transform.FlatteningTransformer) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) SparkAvroPostProcessor(org.apache.hudi.utilities.schema.SparkAvroPostProcessor) Test(org.junit.jupiter.api.Test)
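
The transformer list passed to wrapSchemaProviderWithPostProcessor is what pulls SparkAvroPostProcessor into play here. A custom transformer plugs into the same hook; the sketch below is a hypothetical no-op implementation, assuming Transformer's contract is the four-argument apply shown (which is how FlatteningTransformer is invoked).

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.utilities.transform.Transformer;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Hypothetical pass-through transformer: same extension point as
// FlatteningTransformer, but returns the rows unchanged.
public class IdentityTransformer implements Transformer {

    @Override
    public Dataset<Row> apply(JavaSparkContext jsc, SparkSession sparkSession,
                              Dataset<Row> rowDataset, TypedProperties properties) {
        return rowDataset;
    }
}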

Example 5 with SchemaProvider

Use of org.apache.hudi.utilities.schema.SchemaProvider in project hudi by apache.

From the class TestAbstractDebeziumSource, method testDebeziumEvents.

@ParameterizedTest
@MethodSource("testArguments")
public void testDebeziumEvents(Operation operation) throws Exception {
    String sourceClass = getSourceClass();
    // topic setup.
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    TypedProperties props = createPropsForJsonSource();
    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));
    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] { generateDebeziumEvent(operation).toString() });
    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
    assertEquals(1, fetch.getBatch().get().count());
    // Ensure the before fields are picked for DELETE CDC Events,
    // and after fields are picked for INSERT and UPDATE CDC Events.
    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    // Validate DB specific meta fields
    validateMetaFields(fetch.getBatch().get());
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Dataset(org.apache.spark.sql.Dataset) SchemaRegistryProvider(org.apache.hudi.utilities.schema.SchemaRegistryProvider) DebeziumConstants(org.apache.hudi.common.model.debezium.DebeziumConstants) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) GenericData(org.apache.avro.generic.GenericData) AfterAll(org.junit.jupiter.api.AfterAll) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) BeforeAll(org.junit.jupiter.api.BeforeAll) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) MethodSource(org.junit.jupiter.params.provider.MethodSource) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) TypedProperties(org.apache.hudi.common.config.TypedProperties) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) UtilitiesTestBase(org.apache.hudi.utilities.testutils.UtilitiesTestBase) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) KafkaTestUtils(org.apache.spark.streaming.kafka010.KafkaTestUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) InputBatch(org.apache.hudi.utilities.sources.InputBatch) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) Mockito.mock(org.mockito.Mockito.mock)
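
createPropsForJsonSource() is a private helper of this test and is not shown here. The sketch below illustrates the kind of Kafka wiring such a helper plausibly sets up; the exact property set is an assumption, though the keys are standard Hudi and Kafka consumer configs and KafkaTestUtils.brokerAddress() addresses the embedded broker.

// Hypothetical sketch of the Kafka-related properties for a JSON source.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.kafka.topic", TEST_TOPIC_NAME);
props.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, testUtils.brokerAddress());
props.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
props.setProperty(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());
props.setProperty(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class.getName());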

Aggregations

SchemaProvider (org.apache.hudi.utilities.schema.SchemaProvider): 9 usages
Schema (org.apache.avro.Schema): 6 usages
DelegatingSchemaProvider (org.apache.hudi.utilities.schema.DelegatingSchemaProvider): 4 usages
Test (org.junit.jupiter.api.Test): 4 usages
IOException (java.io.IOException): 3 usages
TypedProperties (org.apache.hudi.common.config.TypedProperties): 3 usages
Option (org.apache.hudi.common.util.Option): 3 usages
RowBasedSchemaProvider (org.apache.hudi.utilities.schema.RowBasedSchemaProvider): 3 usages
Timer (com.codahale.metrics.Timer): 2 usages
ArrayList (java.util.ArrayList): 2 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 2 usages
Configuration (org.apache.hadoop.conf.Configuration): 2 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 2 usages
Pair (org.apache.hudi.common.util.collection.Pair): 2 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 2 usages
UtilHelpers (org.apache.hudi.utilities.UtilHelpers): 2 usages
InputBatch (org.apache.hudi.utilities.sources.InputBatch): 2 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2 usages
Dataset (org.apache.spark.sql.Dataset): 2 usages