
Example 1 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testUpsertsContinuousMode:

private void testUpsertsContinuousMode(HoodieTableType tableType, String tempDir) throws Exception {
    String tableBasePath = dfsBasePath + "/" + tempDir;
    // Keep it higher than batch-size to test continuous mode
    int totalRecords = 3000;
    // Initial bulk insert
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT);
    cfg.continuousMode = true;
    cfg.tableType = tableType.name();
    cfg.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
    cfg.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
    deltaStreamerTestRunner(ds, cfg, (r) -> {
        if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
            TestHelpers.assertAtleastNDeltaCommits(5, tableBasePath, dfs);
            TestHelpers.assertAtleastNCompactionCommits(2, tableBasePath, dfs);
        } else {
            TestHelpers.assertAtleastNCompactionCommits(5, tableBasePath, dfs);
        }
        TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
        TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
        return true;
    });
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer)
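
For orientation, here is a minimal sketch of what a continuous-mode runner such as deltaStreamerTestRunner typically does: start sync() on a background thread, poll the caller's condition function, then stop the streamer. This is an illustrative approximation rather than the actual Hudi test helper, and shutdownGracefully() is assumed to be the available stop hook.

import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;

static void runContinuousUntil(HoodieDeltaStreamer ds, Function<Boolean, Boolean> condition) throws Exception {
    // Continuous mode blocks inside sync(), so run it on a background thread.
    CompletableFuture<Void> ingest = CompletableFuture.runAsync(() -> {
        try {
            ds.sync();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    });
    boolean done = false;
    while (!done && !ingest.isDone()) {
        // Give the streamer time to land more commits before re-checking.
        Thread.sleep(5000);
        try {
            done = condition.apply(true);
        } catch (AssertionError e) {
            // Assertions may fail until enough commits exist; keep polling.
        }
    }
    // Assumed stop hook; adjust to what your Hudi version exposes.
    ds.shutdownGracefully();
}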

Example 2 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testFilterDupes:

@Test
public void testFilterDupes() throws Exception {
    String tableBasePath = dfsBasePath + "/test_dupes_table";
    // Initial bulk insert
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // Generate the same 1000 records + 1000 new ones; with filterDupes enabled, only the new ones are written
    cfg.filterDupes = true;
    cfg.sourceLimit = 2000;
    cfg.operation = WriteOperationType.INSERT;
    new HoodieDeltaStreamer(cfg, jsc).sync();
    TestHelpers.assertRecordCount(2000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
    // 1000 records for commit 00000 & 1000 for commit 00001
    List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(1000, counts.get(0).getLong(1));
    assertEquals(1000, counts.get(1).getLong(1));
    // Test with empty commits
    HoodieTableMetaClient mClient = HoodieTableMetaClient.builder()
        .setConf(jsc.hadoopConfiguration())
        .setBasePath(tableBasePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    HoodieInstant lastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
    HoodieDeltaStreamer.Config cfg2 = TestHelpers.makeDropAllConfig(tableBasePath, WriteOperationType.UPSERT);
    cfg2.filterDupes = false;
    cfg2.sourceLimit = 2000;
    cfg2.operation = WriteOperationType.UPSERT;
    cfg2.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
    HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg2, jsc);
    ds2.sync();
    mClient = HoodieTableMetaClient.builder()
        .setConf(jsc.hadoopConfiguration())
        .setBasePath(tableBasePath)
        .setLoadActiveTimelineOnLoad(true)
        .build();
    HoodieInstant newLastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
    assertTrue(HoodieTimeline.compareTimestamps(newLastFinished.getTimestamp(), HoodieTimeline.GREATER_THAN, lastFinished.getTimestamp()));
    // Ensure it is empty
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class);
    System.out.println("New Commit Metadata=" + commitMetadata);
    assertTrue(commitMetadata.getPartitionToWriteStats().isEmpty());
    // Try UPSERT with filterDupes true. Expect exception
    cfg2.filterDupes = true;
    cfg2.operation = WriteOperationType.UPSERT;
    try {
        new HoodieDeltaStreamer(cfg2, jsc).sync();
        fail("Expected IllegalArgumentException: filterDupes must be disabled for UPSERT");
    } catch (IllegalArgumentException e) {
        assertTrue(e.getMessage().contains("'--filter-dupes' needs to be disabled when '--op' is 'UPSERT' to ensure updates are not missed."));
    }
}
Also used: HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), Row (org.apache.spark.sql.Row), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
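
The dedupe flag explains the counts above: with filterDupes enabled, incoming records whose keys already exist in the table are dropped before writing, so an INSERT of 2000 records containing 1000 existing keys lands only 1000 new ones; for UPSERT the flag is rejected because dropping matched keys would silently discard updates. As a minimal sketch, assuming the standard HoodieDeltaStreamer.Config field names, the same deduped insert could be wired up outside the test helpers (paths, table name, and source class below are placeholders):

import org.apache.hudi.common.model.WriteOperationType;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.spark.api.java.JavaSparkContext;

static void runDedupedInsert(JavaSparkContext jsc) throws Exception {
    HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
    cfg.targetBasePath = "/tmp/test_dupes_table";   // placeholder path
    cfg.targetTableName = "test_dupes_table";       // placeholder table name
    cfg.tableType = "COPY_ON_WRITE";
    cfg.sourceClassName = "org.apache.hudi.utilities.sources.JsonDFSSource"; // placeholder source
    cfg.operation = WriteOperationType.INSERT;
    // Drop incoming records whose keys already exist in the table.
    cfg.filterDupes = true;
    cfg.sourceLimit = 2000;
    new HoodieDeltaStreamer(cfg, jsc).sync();
}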

Example 3 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testHoodieIncrFallback:

@Test
public void testHoodieIncrFallback() throws Exception {
    String tableBasePath = dfsBasePath + "/incr_test_table";
    String downstreamTableBasePath = dfsBasePath + "/incr_test_downstream_table";
    insertInTable(tableBasePath, 1, WriteOperationType.BULK_INSERT);
    HoodieDeltaStreamer.Config downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, null);
    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
    insertInTable(tableBasePath, 9, WriteOperationType.UPSERT);
    // No change expected: this sync fails with a 'Path does not exist' AnalysisException
    assertThrows(org.apache.spark.sql.AnalysisException.class, () -> new HoodieDeltaStreamer(downstreamCfg, jsc).sync());
    TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*", sqlContext);
    if (downstreamCfg.configs == null) {
        downstreamCfg.configs = new ArrayList<>();
    }
    downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key() + "=true");
    // Adding this conf to make testing easier :)
    downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10");
    downstreamCfg.operation = WriteOperationType.UPSERT;
    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
    long baseTableRecords = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet").count();
    long downStreamTableRecords = sqlContext.read().format("org.apache.hudi").load(downstreamTableBasePath + "/*/*.parquet").count();
    assertEquals(baseTableRecords, downStreamTableRecords);
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
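
A minimal sketch, assuming the standard DataSourceReadOptions keys, of the same fallback flag applied to a direct incremental read (the begin instant below is a placeholder):

import org.apache.hudi.DataSourceReadOptions;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;

static Dataset<Row> incrementalReadWithFallback(SQLContext sqlContext, String basePath) {
    return sqlContext.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        // Placeholder begin instant; use the checkpoint you actually track.
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), "00000000000000")
        // Fall back to a full-table scan when files referenced by the incremental range no longer exist.
        .option(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key(), "true")
        .load(basePath);
}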

Example 4 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testPayloadClassUpdateWithCOWTable:

@Test
public void testPayloadClassUpdateWithCOWTable() throws Exception {
    String dataSetBasePath = dfsBasePath + "/test_dataset_cow";
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
        Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
        true, true, false, null, null);
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
    // now create one more deltaStreamer instance and update payload class
    cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
        Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
        true, true, true, DummyAvroPayload.class.getName(), null);
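    // Note: the streamer below is only constructed, not synced; the assertions
    // that follow inspect hoodie.properties directly.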
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
    // now assert that hoodie.properties file does not have payload class prop since it is a COW table
    Properties props = new Properties();
    String metaPath = dataSetBasePath + "/.hoodie/hoodie.properties";
    FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration());
    try (FSDataInputStream inputStream = fs.open(new Path(metaPath))) {
        props.load(inputStream);
    }
    assertFalse(props.containsKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key()));
}
Also used: Path (org.apache.hadoop.fs.Path), HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), SqlQueryBasedTransformer (org.apache.hudi.utilities.transform.SqlQueryBasedTransformer), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream), Properties (java.util.Properties), TypedProperties (org.apache.hudi.common.config.TypedProperties), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
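
Rather than parsing hoodie.properties by hand, the same check can go through the table config API; a minimal sketch, assuming HoodieConfig#contains is available on the metaclient's table config:

import org.apache.hudi.common.table.HoodieTableConfig;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.spark.api.java.JavaSparkContext;

static boolean payloadClassIsSet(JavaSparkContext jsc, String basePath) {
    HoodieTableConfig tableConfig = HoodieTableMetaClient.builder()
        .setConf(jsc.hadoopConfiguration())
        .setBasePath(basePath)
        .build()
        .getTableConfig();
    // Equivalent to the raw properties check above, via the table config API.
    return tableConfig.contains(HoodieTableConfig.PAYLOAD_CLASS_NAME);
}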

Example 5 with HoodieDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.

From the class TestHoodieDeltaStreamer, method testSqlSourceSource:

@Test
public void testSqlSourceSource() throws Exception {
    prepareSqlSource();
    String tableBasePath = dfsBasePath + "/test_sql_source_table" + testNum++;
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(
        TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, SqlSource.class.getName(),
            Collections.emptyList(), PROPS_FILENAME_TEST_SQL_SOURCE, false, false, 1000, false,
            null, null, "timestamp", null, true),
        jsc);
    deltaStreamer.sync();
    TestHelpers.assertRecordCount(SQL_SOURCE_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
}
Also used: HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
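
SqlSource pulls its data from a Spark SQL query configured in the props file; below is a minimal sketch of that wiring, assuming hoodie.deltastreamer.source.sql.sql.query is the key SqlSource reads (verify against your Hudi version):

import org.apache.hudi.common.config.TypedProperties;

static TypedProperties sqlSourceProps() {
    TypedProperties props = new TypedProperties();
    // Query executed by SqlSource against tables registered in the Spark session.
    props.setProperty("hoodie.deltastreamer.source.sql.sql.query", "SELECT * FROM test_sql_table");
    return props;
}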

Aggregations

HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer): 37 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 29 usages
Test (org.junit.jupiter.api.Test): 20 usages
TypedProperties (org.apache.hudi.common.config.TypedProperties): 10 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 9 usages
IOException (java.io.IOException): 8 usages
HoodieException (org.apache.hudi.exception.HoodieException): 7 usages
TableNotFoundException (org.apache.hudi.exception.TableNotFoundException): 7 usages
TopicExistsException (org.apache.kafka.common.errors.TopicExistsException): 7 usages
AnalysisException (org.apache.spark.sql.AnalysisException): 7 usages
Path (org.apache.hadoop.fs.Path): 6 usages
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 6 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 5 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 5 usages
HoodieClusteringJob (org.apache.hudi.utilities.HoodieClusteringJob): 5 usages
Row (org.apache.spark.sql.Row): 5 usages
ValueSource (org.junit.jupiter.params.provider.ValueSource): 5 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 4 usages
Properties (java.util.Properties): 3 usages
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 3 usages