Examples with HoodieDeltaStreamer - org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer

Example 6 with HoodieDeltaStreamer

use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

the class TestHoodieDeltaStreamer method testHoodieAsyncClusteringJob.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testHoodieAsyncClusteringJob(boolean shouldPassInClusteringInstantTime) throws Exception {
    String tableBasePath = dfsBasePath + "/asyncClusteringJob";
    HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "true");
    deltaStreamerTestRunner(ds, (r) -> {
        TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
        Option<String> scheduleClusteringInstantTime = Option.empty();
        try {
            HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, null);
            scheduleClusteringInstantTime = scheduleClusteringJob.doSchedule();
        } catch (Exception e) {
            LOG.warn("Schedule clustering failed", e);
            return false;
        }
        if (scheduleClusteringInstantTime.isPresent()) {
            LOG.info("Schedule clustering success, now cluster with instant time " + scheduleClusteringInstantTime.get());
            HoodieClusteringJob.Config clusterClusteringConfig = buildHoodieClusteringUtilConfig(tableBasePath, shouldPassInClusteringInstantTime ? scheduleClusteringInstantTime.get() : null, false);
            HoodieClusteringJob clusterClusteringJob = new HoodieClusteringJob(jsc, clusterClusteringConfig);
            clusterClusteringJob.cluster(clusterClusteringConfig.retry);
            LOG.info("Cluster success");
        } else {
            LOG.warn("Schedule clustering failed");
        }
        TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
        return true;
    });
}

Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) HoodieClusteringJob(org.apache.hudi.utilities.HoodieClusteringJob) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) TopicExistsException(org.apache.kafka.common.errors.TopicExistsException) AnalysisException(org.apache.spark.sql.AnalysisException) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 7 with HoodieDeltaStreamer

use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

the class TestHoodieDeltaStreamer method testAsyncClusteringService.

@Test
public void testAsyncClusteringService() throws Exception {
    String tableBasePath = dfsBasePath + "/asyncClustering";
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 2000;
    // Initial bulk insert
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
    cfg.continuousMode = true;
    cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3"));
    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
    deltaStreamerTestRunner(ds, cfg, (r) -> {
        TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
        return true;
    });
    // There should be 4 commits, one of which should be a replace commit
    TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
    TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
    TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
}

Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Example 8 with HoodieDeltaStreamer

use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

the class TestHoodieDeltaStreamer method testDeletePartitions.

@Test
public void testDeletePartitions() throws Exception {
    prepareParquetDFSSource(false, false, "source.avsc", "target.avsc", PROPS_FILENAME_TEST_PARQUET, PARQUET_SOURCE_ROOT, false, "partition_path");
    String tableBasePath = dfsBasePath + "/test_parquet_table" + testNum;
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ParquetDFSSource.class.getName(), null, PROPS_FILENAME_TEST_PARQUET, false, false, 100000, false, null, null, "timestamp", null), jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(PARQUET_NUM_RECORDS, tableBasePath, sqlContext);
    testNum++;
    prepareParquetDFSFiles(PARQUET_NUM_RECORDS, PARQUET_SOURCE_ROOT);
    prepareParquetDFSSource(false, false);
    // set write operation to DELETE_PARTITION and add transformer to filter only for records with partition HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION
    deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.DELETE_PARTITION, ParquetDFSSource.class.getName(), Collections.singletonList(TestSpecificPartitionTransformer.class.getName()), PROPS_FILENAME_TEST_PARQUET, false, false, 100000, false, null, null, "timestamp", null), jsc);
    deltaStreamer.sync();
    // No records should match the HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION.
    TestHelpers.assertNoPartitionMatch(tableBasePath, sqlContext, HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH);
}

Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) ParquetDFSSource(org.apache.hudi.utilities.sources.ParquetDFSSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Example 9 with HoodieDeltaStreamer

use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

the class TestHoodieDeltaStreamer method testKafkaTimestampType.

@Test
public void testKafkaTimestampType() throws Exception {
    topicName = "topic" + testNum;
    kafkaCheckpointType = "timestamp";
    prepareJsonKafkaDFSFiles(JSON_KAFKA_NUM_RECORDS, true, topicName);
    prepareJsonKafkaDFSSource(PROPS_FILENAME_TEST_JSON_KAFKA, "earliest", topicName);
    String tableBasePath = dfsBasePath + "/test_json_kafka_table" + testNum;
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, JsonKafkaSource.class.getName(), Collections.EMPTY_LIST, PROPS_FILENAME_TEST_JSON_KAFKA, false, true, 100000, false, null, null, "timestamp", String.valueOf(System.currentTimeMillis())), jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(JSON_KAFKA_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
    prepareJsonKafkaDFSFiles(JSON_KAFKA_NUM_RECORDS, false, topicName);
    deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, JsonKafkaSource.class.getName(), Collections.EMPTY_LIST, PROPS_FILENAME_TEST_JSON_KAFKA, false, true, 100000, false, null, null, "timestamp", String.valueOf(System.currentTimeMillis())), jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(JSON_KAFKA_NUM_RECORDS * 2, tableBasePath + "/*/*.parquet", sqlContext);
}

Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) JsonKafkaSource(org.apache.hudi.utilities.sources.JsonKafkaSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Example 10 with HoodieDeltaStreamer

use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

the class TestHoodieDeltaStreamer method testNullSchemaProvider.

@Test
public void testNullSchemaProvider() throws Exception {
    String tableBasePath = dfsBasePath + "/test_table";
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT, Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true, false, false, null, null);
    Exception e = assertThrows(HoodieException.class, () -> {
        new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    }, "Should error out when schema provider is not provided");
    LOG.debug("Expected error during reading data from source ", e);
    assertTrue(e.getMessage().contains("Please provide a valid schema provider class!"));
}

Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) TopicExistsException(org.apache.kafka.common.errors.TopicExistsException) AnalysisException(org.apache.spark.sql.AnalysisException) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)

Aggregations

HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer)37 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)29 Test (org.junit.jupiter.api.Test)20 TypedProperties (org.apache.hudi.common.config.TypedProperties)10 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)9 IOException (java.io.IOException)8 HoodieException (org.apache.hudi.exception.HoodieException)7 TableNotFoundException (org.apache.hudi.exception.TableNotFoundException)7 TopicExistsException (org.apache.kafka.common.errors.TopicExistsException)7 AnalysisException (org.apache.spark.sql.AnalysisException)7 Path (org.apache.hadoop.fs.Path)6 HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)6 FileSystem (org.apache.hadoop.fs.FileSystem)5 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)5 HoodieClusteringJob (org.apache.hudi.utilities.HoodieClusteringJob)5 Row (org.apache.spark.sql.Row)5 ValueSource (org.junit.jupiter.params.provider.ValueSource)5 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)4 Properties (java.util.Properties)3 FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream)3