
Example 36 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestHoodieDeltaStreamer method testProps.

@Test
public void testProps() {
    TypedProperties props = new DFSPropertiesConfiguration(dfs.getConf(), new Path(dfsBasePath + "/" + PROPS_FILENAME_TEST_SOURCE)).getProps();
    assertEquals(2, props.getInteger("hoodie.upsert.shuffle.parallelism"));
    assertEquals("_row_key", props.getString("hoodie.datasource.write.recordkey.field"));
    assertEquals("org.apache.hudi.utilities.functional.TestHoodieDeltaStreamer$TestGenerator", props.getString("hoodie.datasource.write.keygenerator.class"));
}
Also used : Path(org.apache.hadoop.fs.Path) TypedProperties(org.apache.hudi.common.config.TypedProperties) DFSPropertiesConfiguration(org.apache.hudi.common.config.DFSPropertiesConfiguration) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
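
A minimal standalone sketch (not part of the Hudi test suite; keys and values are illustrative) of what the typed accessors used above add over plain java.util.Properties: values are parsed into the requested type on access, and the overloads with a second argument fall back to a default when the key is absent.

import org.apache.hudi.common.config.TypedProperties;

public class TypedPropertiesSketch {
    public static void main(String[] args) {
        TypedProperties props = new TypedProperties();
        props.setProperty("hoodie.upsert.shuffle.parallelism", "2");
        props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
        // String values are parsed into the requested type on access
        int parallelism = props.getInteger("hoodie.upsert.shuffle.parallelism");
        String recordKeyField = props.getString("hoodie.datasource.write.recordkey.field");
        // Overloads with a default avoid failures for absent keys
        boolean combineBeforeUpsert = props.getBoolean("hoodie.combine.before.upsert", true);
        System.out.println(parallelism + " " + recordKeyField + " " + combineBeforeUpsert);
    }
}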

Example 37 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestHoodieDeltaStreamer method prepareCsvDFSSource.

private void prepareCsvDFSSource(boolean hasHeader, char sep, boolean useSchemaProvider, boolean hasTransformer) throws IOException {
    String sourceRoot = dfsBasePath + "/csvFiles";
    String recordKeyField = (hasHeader || useSchemaProvider) ? "_row_key" : "_c0";
    // Properties used for testing delta-streamer with CSV source
    TypedProperties csvProps = new TypedProperties();
    csvProps.setProperty("include", "base.properties");
    csvProps.setProperty("hoodie.datasource.write.recordkey.field", recordKeyField);
    csvProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
    if (useSchemaProvider) {
        csvProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source-flattened.avsc");
        if (hasTransformer) {
            csvProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target-flattened.avsc");
        }
    }
    csvProps.setProperty("hoodie.deltastreamer.source.dfs.root", sourceRoot);
    if (sep != ',') {
        if (sep == '\t') {
            csvProps.setProperty("hoodie.deltastreamer.csv.sep", "\\t");
        } else {
            csvProps.setProperty("hoodie.deltastreamer.csv.sep", Character.toString(sep));
        }
    }
    if (hasHeader) {
        csvProps.setProperty("hoodie.deltastreamer.csv.header", Boolean.toString(hasHeader));
    }
    UtilitiesTestBase.Helpers.savePropsToDFS(csvProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_CSV);
    String path = sourceRoot + "/1.csv";
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    UtilitiesTestBase.Helpers.saveCsvToDFS(hasHeader, sep, Helpers.jsonifyRecords(dataGenerator.generateInserts("000", CSV_NUM_RECORDS, true)), dfs, path);
}
Also used : TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
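
Because TypedProperties extends java.util.Properties, the same CSV-source keys can be persisted with the inherited store method when DFS helpers such as savePropsToDFS are not at hand. A minimal local-filesystem sketch with a hypothetical output path, using only keys taken from the test above:

import java.io.FileOutputStream;
import java.io.IOException;
import org.apache.hudi.common.config.TypedProperties;

public class CsvSourcePropsSketch {
    public static void main(String[] args) throws IOException {
        TypedProperties csvProps = new TypedProperties();
        csvProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
        csvProps.setProperty("hoodie.deltastreamer.source.dfs.root", "/tmp/csvFiles");
        // A tab separator has to be escaped as "\\t", as in prepareCsvDFSSource above
        csvProps.setProperty("hoodie.deltastreamer.csv.sep", "\\t");
        csvProps.setProperty("hoodie.deltastreamer.csv.header", "true");
        try (FileOutputStream out = new FileOutputStream("/tmp/test-csv-source.properties")) {
            csvProps.store(out, "CSV source configs for the delta streamer (sketch)");
        }
    }
}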

Example 38 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestHoodieDeltaStreamerWithMultiWriter method testUpsertsContinuousModeWithMultipleWritersWithoutConflicts.

@ParameterizedTest
@EnumSource(HoodieTableType.class)
void testUpsertsContinuousModeWithMultipleWritersWithoutConflicts(HoodieTableType tableType) throws Exception {
    // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts
    basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString();
    propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER;
    tableBasePath = basePath + "/testtable_" + tableType;
    prepareInitialConfigs(fs(), basePath, "foo");
    TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 3000;
    HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    prepJobConfig.continuousMode = true;
    prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc());
    // Prepare base dataset with some commits
    deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> {
        if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs());
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs());
        } else {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs());
        }
        TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        return true;
    });
    // create new ingestion & backfill job config to generate only INSERTS to avoid conflict
    props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    props.setProperty("hoodie.test.source.generate.inserts", "true");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER);
    HoodieDeltaStreamer.Config cfgBackfillJob2 = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.INSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TestIdentityTransformer.class.getName()));
    cfgBackfillJob2.continuousMode = false;
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build();
    HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class);
    cfgBackfillJob2.checkpoint = commitMetadata.getMetadata(CHECKPOINT_KEY);
    cfgBackfillJob2.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgBackfillJob2.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer.Config cfgIngestionJob2 = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TestIdentityTransformer.class.getName()));
    cfgIngestionJob2.continuousMode = true;
    cfgIngestionJob2.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgIngestionJob2.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    // re-init ingestion job
    HoodieDeltaStreamer ingestionJob3 = new HoodieDeltaStreamer(cfgIngestionJob2, jsc());
    // re-init backfill job
    HoodieDeltaStreamer backfillJob2 = new HoodieDeltaStreamer(cfgBackfillJob2, jsc());
    // run ingestion & backfill in parallel, avoid conflict and succeed both
    runJobsInParallel(tableBasePath, tableType, totalRecords, ingestionJob3, cfgIngestionJob2, backfillJob2, cfgBackfillJob2, false, "batch2");
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
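
A minimal standalone sketch (hypothetical helper class, not part of the test suite; import paths assumed) of the lock overrides these multi-writer tests layer on top of prepareMultiWriterProps: swap in the in-process lock provider and shorten the lock-acquire timeout.

import org.apache.hudi.common.config.LockConfiguration;
import org.apache.hudi.common.config.TypedProperties;

public class MultiWriterLockPropsSketch {
    // Applies the same overrides as the test above: in-process lock provider
    // plus a 3-second lock-acquire wait.
    static TypedProperties withInProcessLock(TypedProperties props) {
        props.setProperty("hoodie.write.lock.provider",
                "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
        props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
        return props;
    }

    public static void main(String[] args) {
        TypedProperties props = withInProcessLock(new TypedProperties());
        props.forEach((k, v) -> System.out.println(k + "=" + v));
    }
}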

Example 39 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestHoodieDeltaStreamerWithMultiWriter method testLatestCheckpointCarryOverWithMultipleWriters.

@Disabled
@ParameterizedTest
@EnumSource(value = HoodieTableType.class, names = { "COPY_ON_WRITE" })
void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception {
    // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts
    basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString();
    propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER;
    tableBasePath = basePath + "/testtable_" + tableType;
    prepareInitialConfigs(fs(), basePath, "foo");
    TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 3000;
    HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    prepJobConfig.continuousMode = true;
    prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc());
    // Prepare base dataset with some commits
    deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> {
        if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs());
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs());
        } else {
            TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs());
        }
        TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
        return true;
    });
    // create a backfill job with checkpoint from the first instant
    HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT, propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
    cfgBackfillJob.continuousMode = false;
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build();
    HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
    HoodieCommitMetadata commitMetadataForFirstInstant = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class);
    // run the backfill job
    props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
    props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
    props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
    UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
    // get current checkpoint after preparing base dataset with some commits
    HoodieCommitMetadata commitMetadataForLastInstant = getLatestMetadata(meta);
    // Set checkpoint to the last successful position
    cfgBackfillJob.checkpoint = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY);
    cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc());
    backfillJob.sync();
    meta.reloadActiveTimeline();
    int totalCommits = meta.getCommitsTimeline().filterCompletedInstants().countInstants();
    // add a new commit to timeline which may not have the checkpoint in extra metadata
    addCommitToTimeline(meta);
    meta.reloadActiveTimeline();
    verifyCommitMetadataCheckpoint(meta, null);
    cfgBackfillJob.checkpoint = null;
    // if deltastreamer checkpoint fetch does not walk back to older commits, this sync will fail
    new HoodieDeltaStreamer(cfgBackfillJob, jsc()).sync();
    meta.reloadActiveTimeline();
    Assertions.assertEquals(totalCommits + 2, meta.getCommitsTimeline().filterCompletedInstants().countInstants());
    verifyCommitMetadataCheckpoint(meta, "00008");
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) EnumSource(org.junit.jupiter.params.provider.EnumSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Disabled(org.junit.jupiter.api.Disabled)
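
getLatestMetadata is a test helper whose body is not shown here. A minimal sketch of what such a lookup might do, assuming it reads the last completed commit from the timeline and that CHECKPOINT_KEY is the key under which the delta streamer records its checkpoint in commit metadata:

import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class LatestCheckpointSketch {
    // Resolve the checkpoint carried by the most recent completed commit,
    // or null if that commit was written without one.
    static String latestCheckpoint(HoodieTableMetaClient meta, String checkpointKey) throws Exception {
        HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        // Assumes at least one completed commit exists on the timeline
        HoodieInstant lastInstant = timeline.lastInstant().get();
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(
                timeline.getInstantDetails(lastInstant).get(), HoodieCommitMetadata.class);
        return metadata.getMetadata(checkpointKey);
    }
}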

Example 40 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestHoodieMultiTableDeltaStreamer method testMultiTableExecutionWithKafkaSource.

@Test
public void testMultiTableExecutionWithKafkaSource() throws IOException {
    // create topics for each table
    String topicName1 = "topic" + testNum++;
    String topicName2 = "topic" + testNum;
    testUtils.createTopic(topicName1, 2);
    testUtils.createTopic(topicName2, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false, null);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    List<TableExecutionContext> executionContexts = streamer.getTableExecutionContexts();
    TypedProperties properties = executionContexts.get(1).getProperties();
    properties.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName2);
    executionContexts.get(1).setProperties(properties);
    TypedProperties properties1 = executionContexts.get(0).getProperties();
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName1);
    executionContexts.get(0).setProperties(properties1);
    String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath;
    String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath;
    streamer.sync();
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    // insert updates for already existing records in kafka topics
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    streamer.getTableExecutionContexts().get(1).setProperties(properties);
    streamer.getTableExecutionContexts().get(0).setProperties(properties1);
    streamer.sync();
    assertEquals(2, streamer.getSuccessTables().size());
    assertTrue(streamer.getFailedTables().isEmpty());
    // assert the record count matches now
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    testNum++;
}
Also used : JsonKafkaSource(org.apache.hudi.utilities.sources.JsonKafkaSource) HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
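
A minimal standalone sketch (hypothetical topic and schema file paths) of the per-table override pattern above: derive each table's Kafka source properties from a shared set of defaults, assuming the TypedProperties(Properties) copy constructor.

import org.apache.hudi.common.config.TypedProperties;

public class PerTableKafkaPropsSketch {
    static TypedProperties forTopic(TypedProperties shared, String topic, String sourceSchemaFile, String targetSchemaFile) {
        // Copy the shared defaults, then override the per-table source settings
        TypedProperties props = new TypedProperties(shared);
        props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);
        props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", sourceSchemaFile);
        props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", targetSchemaFile);
        return props;
    }

    public static void main(String[] args) {
        TypedProperties shared = new TypedProperties();
        shared.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
        TypedProperties table1 = forTopic(shared, "topic1", "/tmp/source_uber.avsc", "/tmp/target_uber.avsc");
        table1.forEach((k, v) -> System.out.println(k + "=" + v));
    }
}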

Aggregations

TypedProperties (org.apache.hudi.common.config.TypedProperties): 143
Test (org.junit.jupiter.api.Test): 47
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 22
JavaRDD (org.apache.spark.api.java.JavaRDD): 16
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 15
IOException (java.io.IOException): 14
Path (org.apache.hadoop.fs.Path): 14
Properties (java.util.Properties): 13
GenericRecord (org.apache.avro.generic.GenericRecord): 13
SourceFormatAdapter (org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter): 12
Row (org.apache.spark.sql.Row): 12
BeforeEach (org.junit.jupiter.api.BeforeEach): 11
ArrayList (java.util.ArrayList): 10
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 10
HoodieKey (org.apache.hudi.common.model.HoodieKey): 9
DFSPropertiesConfiguration (org.apache.hudi.common.config.DFSPropertiesConfiguration): 8
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 8
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 8
Dataset (org.apache.spark.sql.Dataset): 8
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 7