
Example 21 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testDeltaSyncWithPendingClustering.

@Test
public void testDeltaSyncWithPendingClustering() throws Exception {
    String tableBasePath = dfsBasePath + "/inlineClusteringPending";
    // ingest data
    int totalRecords = 2000;
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
    cfg.continuousMode = false;
    cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
    ds.sync();
    // assert ingest successful
    TestHelpers.assertAtLeastNCommits(1, tableBasePath, dfs);
    // schedule a clustering job to build a clustering plan and transition to inflight
    HoodieClusteringJob clusteringJob = initialHoodieClusteringJob(tableBasePath, null, false, "schedule");
    clusteringJob.cluster(0);
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build();
    List<HoodieInstant> hoodieClusteringInstants = meta.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList());
    HoodieInstant clusteringRequest = hoodieClusteringInstants.get(0);
    meta.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringRequest, Option.empty());
    // do another ingestion with inline clustering enabled
    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", ""));
    cfg.retryLastPendingInlineClusteringJob = true;
    HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg, jsc);
    ds2.sync();
    String completeClusteringTimeStamp = meta.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant().get().getTimestamp();
    assertEquals(clusteringRequest.getTimestamp(), completeClusteringTimeStamp);
    TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
    TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), HoodieClusteringJob (org.apache.hudi.utilities.HoodieClusteringJob), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
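
For context outside the test harness, here is a minimal sketch of wiring the same inline-clustering behavior by hand. The keys hoodie.clustering.inline and hoodie.clustering.inline.max.commits are standard Hudi clustering settings, and the getAsyncServicesConfigs helper above presumably expands to something equivalent; the path and table name below are hypothetical, and source/props wiring is omitted.

// Minimal sketch (hypothetical paths, source wiring omitted)
HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
cfg.targetBasePath = "/tmp/hudi/inline_clustering_table";   // hypothetical
cfg.targetTableName = "inline_clustering_table";            // hypothetical
cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
cfg.operation = WriteOperationType.INSERT;
cfg.configs.add("hoodie.clustering.inline=true");           // cluster as part of the write
cfg.configs.add("hoodie.clustering.inline.max.commits=2");  // trigger every 2 commits
cfg.retryLastPendingInlineClusteringJob = true;             // finish a pending plan before scheduling new ones
new HoodieDeltaStreamer(cfg, jsc).sync();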

Example 22 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testBulkInsertsAndUpsertsWithBootstrap.

@Test
public void testBulkInsertsAndUpsertsWithBootstrap() throws Exception {
    String tableBasePath = dfsBasePath + "/test_table";
    // Initial bulk insert
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // No new data => no commits.
    cfg.sourceLimit = 0;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // upsert() #1
    cfg.sourceLimit = 2000;
    cfg.operation = WriteOperationType.UPSERT;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
    List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    // Perform bootstrap with tableBasePath as source
    String bootstrapSourcePath = dfsBasePath + "/src_bootstrapped";
    Dataset<Row> sourceDf = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet");
    sourceDf.write().format("parquet").save(bootstrapSourcePath);
    String newDatasetBasePath = dfsBasePath + "/test_dataset_bootstrapped";
    cfg.runBootstrap = true;
    cfg.configs.add(String.format("hoodie.bootstrap.base.path=%s", bootstrapSourcePath));
    cfg.configs.add(String.format("hoodie.bootstrap.keygen.class=%s", SimpleKeyGenerator.class.getName()));
    cfg.configs.add("hoodie.bootstrap.parallelism=5");
    cfg.targetBasePath = newDatasetBasePath;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    Dataset<Row> res = sqlContext.read().format("org.apache.hudi").load(newDatasetBasePath + "/*.parquet");
    LOG.info("Schema :");
    res.printSchema();
    TestHelpers.assertRecordCount(1950, newDatasetBasePath + "/*.parquet", sqlContext);
    res.registerTempTable("bootstrapped");
    assertEquals(1950, sqlContext.sql("select distinct _hoodie_record_key from bootstrapped").count());
    StructField[] fields = res.schema().fields();
    List<String> fieldNames = Arrays.asList(res.schema().fieldNames());
    List<String> expectedFieldNames = Arrays.asList(sourceDf.schema().fieldNames());
    assertEquals(expectedFieldNames.size(), fields.length);
    assertTrue(fieldNames.containsAll(HoodieRecord.HOODIE_META_COLUMNS));
    assertTrue(fieldNames.containsAll(expectedFieldNames));
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), StructField (org.apache.spark.sql.types.StructField), Row (org.apache.spark.sql.Row), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
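
The countsPerCommit assertion above presumably groups records by the _hoodie_commit_time meta column; a hedged sketch of that check, using only the Spark APIs already imported in these tests:

// Count records per commit via the _hoodie_commit_time meta column
Dataset<Row> df = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet");
List<Row> perCommit = df.groupBy("_hoodie_commit_time").count().collectAsList();
long total = perCommit.stream().mapToLong(r -> r.getLong(1)).sum();  // should equal 1950 here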

Example 23 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testAsyncClusteringServiceWithCompaction.

@Test
public void testAsyncClusteringServiceWithCompaction() throws Exception {
    String tableBasePath = dfsBasePath + "/asyncClusteringCompaction";
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 2000;
    // Initial bulk insert
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
    cfg.continuousMode = true;
    cfg.tableType = HoodieTableType.MERGE_ON_READ.name();
    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3"));
    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
    deltaStreamerTestRunner(ds, cfg, (r) -> {
        TestHelpers.assertAtleastNCompactionCommits(2, tableBasePath, dfs);
        TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
        return true;
    });
    // There should be at least 4 commits by now, at least one of which is a replace commit
    TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
    TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
    TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
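
The positional arguments to getAsyncServicesConfigs are opaque here; a hedged reading of this call, assuming the helper expands to the standard Hudi keys for cleaning and async clustering (the exact implementation may differ):

// Presumed expansion of getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3")
cfg.configs.add("hoodie.clean.automatic=false");          // "false": disable automatic cleaning
cfg.configs.add("hoodie.clustering.async.enabled=true");  // "true": run clustering as an async service
cfg.configs.add("hoodie.clustering.async.max.commits=3"); // "3": trigger clustering after 3 commits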

Example 24 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testJsonKafkaDFSSource.

@Test
public void testJsonKafkaDFSSource() throws Exception {
    topicName = "topic" + testNum;
    // seed the Kafka topic with JSON records and point the source props at it
    prepareJsonKafkaDFSFiles(JSON_KAFKA_NUM_RECORDS, true, topicName);
    prepareJsonKafkaDFSSource(PROPS_FILENAME_TEST_JSON_KAFKA, "earliest", topicName);
    String tableBasePath = dfsBasePath + "/test_json_kafka_table" + testNum;
    // first sync: consume everything from the earliest offset
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, JsonKafkaSource.class.getName(), Collections.EMPTY_LIST, PROPS_FILENAME_TEST_JSON_KAFKA, false, true, 100000, false, null, null, "timestamp", null), jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(JSON_KAFKA_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
    // publish 10 more records and sync again; the record count should grow accordingly
    int totalRecords = JSON_KAFKA_NUM_RECORDS;
    int records = 10;
    totalRecords += records;
    prepareJsonKafkaDFSFiles(records, false, topicName);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
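
prepareJsonKafkaDFSSource writes the Kafka consumer properties consumed by JsonKafkaSource; a minimal sketch of what that file plausibly contains, expressed with TypedProperties. The broker address is hypothetical, and hoodie.deltastreamer.source.kafka.topic is Hudi's standard key for the source topic.

// Hypothetical equivalent of the generated PROPS_FILENAME_TEST_JSON_KAFKA contents
TypedProperties props = new TypedProperties();
props.setProperty("bootstrap.servers", "localhost:9092");                // hypothetical broker
props.setProperty("auto.offset.reset", "earliest");                      // matches the "earliest" argument above
props.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName); // topic created for this test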

Example 25 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testTableCreation.

@Test
public void testTableCreation() throws Exception {
    Exception e = assertThrows(TableNotFoundException.class, () -> {
        dfs.mkdirs(new Path(dfsBasePath + "/not_a_table"));
        HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(dfsBasePath + "/not_a_table", WriteOperationType.BULK_INSERT), jsc);
        deltaStreamer.sync();
    }, "Should error out when pointed out at a dir thats not a table");
    // expected
    LOG.debug("Expected error during table creation", e);
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), HoodieException (org.apache.hudi.exception.HoodieException), IOException (java.io.IOException), TableNotFoundException (org.apache.hudi.exception.TableNotFoundException), TopicExistsException (org.apache.kafka.common.errors.TopicExistsException), AnalysisException (org.apache.spark.sql.AnalysisException), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
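
What the assertion exercises: HoodieTableMetaClient refuses to open a directory that has no .hoodie metadata folder. A small sketch of that failure mode, reusing the builder pattern from Example 21:

// Building a meta client against a plain directory throws TableNotFoundException
try {
    HoodieTableMetaClient.builder()
        .setConf(dfs.getConf())
        .setBasePath(dfsBasePath + "/not_a_table")
        .build();
} catch (TableNotFoundException e) {
    // expected: the path exists but was never initialized as a Hudi table
}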

Aggregations

HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer): 37 uses
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 29 uses
Test (org.junit.jupiter.api.Test): 20 uses
TypedProperties (org.apache.hudi.common.config.TypedProperties): 10 uses
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 9 uses
IOException (java.io.IOException): 8 uses
HoodieException (org.apache.hudi.exception.HoodieException): 7 uses
TableNotFoundException (org.apache.hudi.exception.TableNotFoundException): 7 uses
TopicExistsException (org.apache.kafka.common.errors.TopicExistsException): 7 uses
AnalysisException (org.apache.spark.sql.AnalysisException): 7 uses
Path (org.apache.hadoop.fs.Path): 6 uses
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 6 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 5 uses
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 5 uses
HoodieClusteringJob (org.apache.hudi.utilities.HoodieClusteringJob): 5 uses
Row (org.apache.spark.sql.Row): 5 uses
ValueSource (org.junit.jupiter.params.provider.ValueSource): 5 uses
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 4 uses
Properties (java.util.Properties): 3 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 3 uses