
Example 11 with HiveSyncConfig

use of org.apache.hudi.hive.HiveSyncConfig in project hudi by apache.

the class HiveQueryNode method execute.

@Override
public void execute(ExecutionContext executionContext, int curItrCount) throws Exception {
    log.info("Executing hive query node {}", this.getName());
    this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
    HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(
        executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().getProps(),
        executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
        executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
    this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
    Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser, hiveSyncConfig.hivePass);
    Statement stmt = con.createStatement();
    stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
    for (String hiveProperty : this.config.getHiveProperties()) {
        executeStatement(hiveProperty, stmt);
    }
    for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
        log.info("Running {}", queryAndResult.getLeft());
        ResultSet res = stmt.executeQuery(queryAndResult.getLeft());
        if (!res.next()) {
            log.info("res.next() was False - typically this means the query returned no rows.");
            assert 0 == queryAndResult.getRight();
        } else {
            Integer result = res.getInt(1);
            if (!queryAndResult.getRight().equals(result)) {
                throw new AssertionError("QUERY: " + queryAndResult.getLeft() + " | EXPECTED RESULT = " + queryAndResult.getRight() + " | ACTUAL RESULT = " + result);
            }
        }
        log.info("Successfully validated query!");
    }
    this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
Also used : Statement(java.sql.Statement) Connection(java.sql.Connection) ResultSet(java.sql.ResultSet) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig)
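
A note on the snippet above: the Connection, Statement, and ResultSet are never closed, so a failed query leaves the JDBC session to the local Hive service open. A minimal try-with-resources variant of the same validation loop (a sketch, not code from the project; it assumes the same hiveSyncConfig fields and config getters used above) would be:

// Sketch only: same validation flow as HiveQueryNode.execute, but with the JDBC
// resources closed automatically even when a query or assertion fails.
try (Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser, hiveSyncConfig.hivePass);
     Statement stmt = con.createStatement()) {
    stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
    // Apply any user-supplied Hive session properties before running the queries.
    for (String hiveProperty : this.config.getHiveProperties()) {
        executeStatement(hiveProperty, stmt);
    }
    // Each query is expected to return a single value (e.g. a COUNT) equal to the configured
    // expectation; an empty result set is treated as 0.
    for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
        try (ResultSet res = stmt.executeQuery(queryAndResult.getLeft())) {
            int actual = res.next() ? res.getInt(1) : 0;
            if (!queryAndResult.getRight().equals(actual)) {
                throw new AssertionError("QUERY: " + queryAndResult.getLeft()
                    + " | EXPECTED RESULT = " + queryAndResult.getRight()
                    + " | ACTUAL RESULT = " + actual);
            }
        }
    }
}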

Example 12 with HiveSyncConfig

use of org.apache.hudi.hive.HiveSyncConfig in project hudi by apache.

the class TestHoodieDeltaStreamer method testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline.

/**
 * Test Bulk Insert and upserts with hive syncing. Tests Hudi incremental processing using a 2-step pipeline. The
 * first step involves using a SQL template to transform a source:
 *
 *   TEST-DATA-SOURCE ============================> HUDI TABLE 1 ===============> HUDI TABLE 2
 *                     (incr-pull with transform)                  (incr-pull)
 *
 * Hudi Table 1 is synced with Hive.
 */
@Test
public void testBulkInsertsAndUpsertsWithSQLBasedTransformerFor2StepPipeline() throws Exception {
    String tableBasePath = dfsBasePath + "/test_table2";
    String downstreamTableBasePath = dfsBasePath + "/test_downstream_table2";
    HiveSyncConfig hiveSyncConfig = getHiveSyncConfig(tableBasePath, "hive_trips");
    // Initial bulk insert to ingest to first hudi table
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT, Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, true);
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
    String lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // Now incrementally pull from the above hudi table and ingest to second table
    HoodieDeltaStreamer.Config downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, null);
    new HoodieDeltaStreamer(downstreamCfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
    // No new data => no commits for upstream table
    cfg.sourceLimit = 0;
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
    // with no change in upstream table, no change in downstream too when pulled.
    HoodieDeltaStreamer.Config downstreamCfg1 = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, DummySchemaProvider.class.getName());
    new HoodieDeltaStreamer(downstreamCfg1, jsc).sync();
    TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 1);
    // upsert() #1 on upstream hudi table
    cfg.sourceLimit = 2000;
    cfg.operation = WriteOperationType.UPSERT;
    new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
    TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(1950, tableBasePath + "/*/*.parquet", sqlContext);
    lastInstantForUpstreamTable = TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
    List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    // Incrementally pull changes in upstream hudi table and apply to downstream table
    downstreamCfg = TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.UPSERT, false, null);
    downstreamCfg.sourceLimit = 2000;
    new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
    TestHelpers.assertRecordCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCountWithExactValue(2000, downstreamTableBasePath + "/*/*.parquet", sqlContext);
    String finalInstant = TestHelpers.assertCommitMetadata(lastInstantForUpstreamTable, downstreamTableBasePath, dfs, 2);
    counts = TestHelpers.countsPerCommit(downstreamTableBasePath + "/*/*.parquet", sqlContext);
    assertEquals(2000, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
    // Test Hive integration
    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), dfs);
    assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName), "Table " + hiveSyncConfig.tableName + " should exist");
    assertEquals(1, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(), "Table partitions should match the number of partitions we wrote");
    assertEquals(lastInstantForUpstreamTable, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(), "The last commit that was synced should be updated in the TBLPROPERTIES");
}
Also used : HoodieDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) DummySchemaProvider(org.apache.hudi.utilities.DummySchemaProvider) HoodieHiveClient(org.apache.hudi.hive.HoodieHiveClient) Row(org.apache.spark.sql.Row) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
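
The getHiveSyncConfig(tableBasePath, "hive_trips") helper used at the top of this test is not part of the excerpt. A plausible sketch of such a helper, assuming the same public HiveSyncConfig fields that HiveTestUtil.setUp populates in Example 15 (the JDBC URL and database name here are illustrative, not the values from the actual test class):

// Hypothetical helper; mirrors the fields set in HiveTestUtil.setUp (Example 15).
private static HiveSyncConfig getHiveSyncConfig(String basePath, String tableName) {
    HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
    hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/"; // assumed: URL of the embedded test HiveServer2
    hiveSyncConfig.hiveUser = "";
    hiveSyncConfig.hivePass = "";
    hiveSyncConfig.databaseName = "testdb";                  // assumed test database name
    hiveSyncConfig.tableName = tableName;
    hiveSyncConfig.basePath = basePath;
    hiveSyncConfig.assumeDatePartitioning = true;
    hiveSyncConfig.usePreApacheInputFormat = false;
    hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
    return hiveSyncConfig;
}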

Example 13 with HiveSyncConfig

use of org.apache.hudi.hive.HiveSyncConfig in project hudi by apache.

the class TestHiveIncrementalPuller method testPuller.

@Test
public void testPuller() throws IOException, URISyntaxException {
    createTables();
    HiveIncrementalPuller.Config cfg = getHivePullerConfig("select name from testdb.test1 where `_hoodie_commit_time` > '%s'");
    HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
    hiveClient.createDatabase(cfg.tmpDb);
    HiveIncrementalPuller puller = new HiveIncrementalPuller(cfg);
    puller.saveDelta();
    HiveSyncConfig assertingConfig = getAssertionSyncConfig(cfg.tmpDb);
    HoodieHiveClient assertingClient = new HoodieHiveClient(assertingConfig, HiveTestUtil.getHiveConf(), fileSystem);
    String tmpTable = cfg.targetTable + "__" + cfg.sourceTable;
    assertTrue(assertingClient.doesTableExist(tmpTable));
}
Also used : HoodieHiveClient(org.apache.hudi.hive.HoodieHiveClient) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) Test(org.junit.jupiter.api.Test)

Example 14 with HiveSyncConfig

use of org.apache.hudi.hive.HiveSyncConfig in project hudi by apache.

the class TestHiveIncrementalPuller method getAssertionSyncConfig.

private HiveSyncConfig getAssertionSyncConfig(String databaseName) {
    HiveSyncConfig config = HiveSyncConfig.copy(hiveSyncConfig);
    config.databaseName = databaseName;
    return config;
}
Also used : HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig)

Example 15 with HiveSyncConfig

use of org.apache.hudi.hive.HiveSyncConfig in project hudi by apache.

the class HiveTestUtil method setUp.

public static void setUp() throws IOException, InterruptedException, HiveException, MetaException {
    configuration = new Configuration();
    if (zkServer == null) {
        zkService = new ZookeeperTestService(configuration);
        zkServer = zkService.start();
    }
    if (hiveServer == null) {
        hiveTestService = new HiveTestService(configuration);
        hiveServer = hiveTestService.start();
    }
    fileSystem = FileSystem.get(configuration);
    hiveSyncConfig = new HiveSyncConfig();
    hiveSyncConfig.jdbcUrl = hiveTestService.getJdbcHive2Url();
    hiveSyncConfig.hiveUser = "";
    hiveSyncConfig.hivePass = "";
    hiveSyncConfig.databaseName = "testdb";
    hiveSyncConfig.tableName = "test1";
    hiveSyncConfig.basePath = Files.createTempDirectory("hivesynctest" + Instant.now().toEpochMilli()).toUri().toString();
    hiveSyncConfig.assumeDatePartitioning = true;
    hiveSyncConfig.usePreApacheInputFormat = false;
    hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
    dtfOut = DateTimeFormatter.ofPattern("yyyy/MM/dd");
    ddlExecutor = new HiveQueryDDLExecutor(hiveSyncConfig, fileSystem, getHiveConf());
    clear();
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ZookeeperTestService(org.apache.hudi.common.testutils.minicluster.ZookeeperTestService) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) HiveQueryDDLExecutor(org.apache.hudi.hive.ddl.HiveQueryDDLExecutor)
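
Once populated like this, the config is typically handed to HiveSyncTool (org.apache.hudi.hive.HiveSyncTool, listed in the Aggregations below) to register the table and its partitions in the Hive metastore. A minimal sketch, assuming the constructor taking the config, a HiveConf, and a FileSystem that this generation of hudi-hive-sync exposes:

// Sketch only: sync the Hudi table described by hiveSyncConfig into the local Hive metastore.
HiveSyncTool hiveSyncTool = new HiveSyncTool(hiveSyncConfig, getHiveConf(), fileSystem);
// Creates or updates the Hive table, syncs partitions, and records the last synced commit time.
hiveSyncTool.syncHoodieTable();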

Aggregations

HiveSyncConfig (org.apache.hudi.hive.HiveSyncConfig) 20
HiveConf (org.apache.hadoop.hive.conf.HiveConf) 5
HiveSyncTool (org.apache.hudi.hive.HiveSyncTool) 3
HiveQueryDDLExecutor (org.apache.hudi.hive.ddl.HiveQueryDDLExecutor) 3
FileSystem (org.apache.hadoop.fs.FileSystem) 2
TypedProperties (org.apache.hudi.common.config.TypedProperties) 2
HoodieHiveClient (org.apache.hudi.hive.HoodieHiveClient) 2
Test (org.junit.jupiter.api.Test) 2
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 2
Connection (java.sql.Connection) 1
ResultSet (java.sql.ResultSet) 1
Statement (java.sql.Statement) 1
ArrayList (java.util.ArrayList) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
Path (org.apache.hadoop.fs.Path) 1
ZookeeperTestService (org.apache.hudi.common.testutils.minicluster.ZookeeperTestService) 1
JDBCExecutor (org.apache.hudi.hive.ddl.JDBCExecutor) 1
QueryBasedDDLExecutor (org.apache.hudi.hive.ddl.QueryBasedDDLExecutor) 1
DummySchemaProvider (org.apache.hudi.utilities.DummySchemaProvider) 1
HoodieDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer) 1