Example 1 with HoodieHiveClient

Use of org.apache.hudi.hive.HoodieHiveClient in project hudi by apache.

From the class TestHoodieDeltaStreamer, the method testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline:

/**
 * Tests bulk inserts and upserts with Hive syncing. Exercises Hudi incremental processing using a 2-step pipeline,
 * where the first step uses a SQL template to transform the source:
 *
 *   TEST-DATA-SOURCE ============================> HUDI TABLE 1 ===============> HUDI TABLE 2
 *                    (incr-pull with transform)                  (incr-pull)
 *
 * Hudi Table 1 is synced with Hive.
 */
@Test
public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception {
    String tableBasePath = dfsBasePath + "/test_table2";
    String downstreamTableBasePath = dfsBasePath + "/test_downstream_table2";
    HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips");
    // Initial bulk insert to ingest to first hudi table
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT, Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true);
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
    String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // Now incrementally pull from the above hudi table and ingest to second table
    HoodieDeltaStreamer.Config downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, null);
    new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
    // No new data => no commits for upstream table
    cfg.sourceLimit = 0;
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // with no change in upstream table, no change in downstream too when pulled.
    HoodieDeltaStreamer.Config downstreamCfg1 = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, DummySchemaProvider.class.getName());
    new HoodieDeltaStreamer(downstreamCfg1, jsc).sync();
    TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
    // upsert() #1 on upstream hudi table
    cfg.sourceLimit = 2000;
    cfg.operation = WriteOperationType.UPSERT;
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1950, tableBasePath + "/*/*.parquet", sqlContext);
    lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
    List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    // Incrementally pull changes in upstream hudi table and apply to downstream table
    downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.UPSERT, false, null);
    downstreamCfg.sourceLimit = 2000;
    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
    TestHelpers.assertRecordCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    String finalInstant = TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 2);
    counts = TestHelpers.countsPerCommit(downstreamTableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(2000, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    // Test Hive integration
    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs);
    assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist");
    assertEquals(1, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), "Table partitions should match the number of partitions we wrote");
    assertEquals(lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), "The last commit that was synced should be updated in the TBLPROPERTIES");
}
Also used: HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) DummySchemaProvider(org.apache.hudi.utilities.DummySchemaProvider) HoodieHiveClient(org.apache.hudi.hive.HoodieHiveClient) Row(org.apache.spark.sql.Row) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
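
The first hop of this pipeline goes through SqlQueryBasedTransformer. As a minimal sketch of how such a transformer is configured (the query below is illustrative, not the template shipped in PROPS_FILENAME_TEST_SOURCE): the SQL is supplied via the hoodie.deltastreamer.transformer.sql property, and the <SRC> placeholder is replaced at runtime with a temporary view over the incoming source batch.

import java.util.Properties;

// A minimal sketch, not the test's actual props file: SqlQueryBasedTransformer
// reads its query from "hoodie.deltastreamer.transformer.sql" and substitutes
// <SRC> with a temp view registered over the incoming source batch.
public class SqlTransformerConfigSketch {
    public static Properties transformerProps() {
        Properties props = new Properties();
        props.setProperty("hoodie.deltastreamer.transformer.sql",
            "SELECT *, current_timestamp() AS ingestion_ts FROM <SRC>");
        return props;
    }
}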

Example 2 with HoodieHiveClient

Use of org.apache.hudi.hive.HoodieHiveClient in project hudi by apache.

From the class TestHiveIncrementalPuller, the method testPuller:

@Test
public void testPuller() throws IOException, URISyntaxException {
    createTables();
    // The %s placeholder in the template is later bound to a begin-commit instant,
    // so the query pulls only rows written after it.
    HiveIncrementalPuller.Config cfg = getHivePullerConfig("select name from testdb.test1 where `_hoodie_commit_time` > '%s'");
    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
    hiveClient.createDatabase(cfg.tmpDb);
    HiveIncrementalPuller puller = new HiveIncrementalPuller(cfg);
    puller.saveDelta();
    // saveDelta() materializes the delta into a temp table named <targetTable>__<sourceTable> in tmpDb
    HiveSyncConfig assertingConfig = getAssertionSyncConfig(cfg.tmpDb);
    HoodieHiveClient assertingClient = new HoodieHiveClient(assertingConfig, HiveTestUtil.getHiveConf(), fileSystem);
    String tmpTable = cfg.targetTable + "__" + cfg.sourceTable;
    assertTrue(assertingClient.doesTableExist(tmpTable));
}
Also used: HoodieHiveClient(org.apache.hudi.hive.HoodieHiveClient) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) Test(org.junit.jupiter.api.Test)
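
For context on the template passed to getHivePullerConfig above: its %s placeholder is bound to a begin-commit instant, so only rows written after that instant are selected. A minimal sketch of that binding, assuming a connected HoodieHiveClient; buildIncrementalSql is a hypothetical helper, not part of HiveIncrementalPuller's API, though getLastCommitTimeSynced is the same call Example 1 asserts on.

import org.apache.hudi.hive.HoodieHiveClient;

public class IncrementalSqlSketch {
    // Hypothetical helper: binds the template's %s to the last commit time synced
    // to Hive, so the formatted query pulls only rows written after that instant.
    static String buildIncrementalSql(HoodieHiveClient hiveClient) {
        String template = "select name from testdb.test1 where `_hoodie_commit_time` > '%s'";
        String lastSynced = hiveClient.getLastCommitTimeSynced("test1").get();
        return String.format(template, lastSynced);
    }
}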

Aggregations

HiveSyncConfig (org.apache.hudi.hive.HiveSyncConfig): 2 uses
HoodieHiveClient (org.apache.hudi.hive.HoodieHiveClient): 2 uses
Test (org.junit.jupiter.api.Test): 2 uses
DummySchemaProvider (org.apache.hudi.utilities.DummySchemaProvider): 1 use
HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer): 1 use
Row (org.apache.spark.sql.Row): 1 use
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 1 use