Search in sources :

Example 76 with WorkUnit

Example of using org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project.

From the class HiveSourceTest, method testGetWorkunitsAfterWatermark:

@Test
public void testGetWorkunitsAfterWatermark() throws Exception {
    // Two tables in a fresh database; a previous run has already consumed the first
    // table up to its creation time, so only the second should yield a work unit.
    String db = "testdb4";
    String firstTable = "testtable1";
    String firstTableLoc = "/tmp/testtable1";
    String secondTable = "testtable2";
    String secondTableLoc = "/tmp/testtable2";
    this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(db, false, true, true);
    this.hiveMetastoreTestUtils.createTestAvroTable(db, firstTable, firstTableLoc, Optional.<String>absent());
    this.hiveMetastoreTestUtils.createTestAvroTable(db, secondTable, secondTableLoc, Optional.<String>absent(), true);

    // Simulate a prior execution whose watermark for the first table equals its create time.
    Table existing = this.hiveMetastoreTestUtils.getLocalMetastoreClient().getTable(db, firstTable);
    long createTimeMillis = TimeUnit.MILLISECONDS.convert(existing.getCreateTime(), TimeUnit.SECONDS);
    List<WorkUnitState> previousWorkUnitStates =
        Lists.newArrayList(ConversionHiveTestUtils.createWus(db, firstTable, createTimeMillis));

    SourceState testState = new SourceState(getTestState(db), previousWorkUnitStates);
    testState.setProp(HiveSource.HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, TableLevelWatermarker.Factory.class.getName());

    List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);
    Assert.assertEquals(workUnits.size(), 1);

    // The single work unit must target the second (unprocessed) table.
    HiveWorkUnit hiveWorkUnit = new HiveWorkUnit(workUnits.get(0));
    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getDb(), db);
    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getTable(), secondTable);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.metastore.api.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) HiveWorkUnit(org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit) HiveWorkUnit(org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 77 with WorkUnit

Example of using org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project.

From the class HiveSourceTest, method testGetWorkUnitsForPartitions:

@Test
public void testGetWorkUnitsForPartitions() throws Exception {
    // A partitioned table with a single partition "field=f1" should produce one
    // partition work unit plus one dummy watermark work unit.
    String db = "testdb3";
    String table = "testtable3";
    String tableLoc = "/tmp/testtable3";
    this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(db, false, true, true);
    SourceState testState = getTestState(db);

    // NOTE(review): the (int) cast truncates the millisecond timestamp; presumably it is
    // only an opaque create-time value here — confirm against addTestPartition's contract.
    Table avroTable = this.hiveMetastoreTestUtils.createTestAvroTable(db, table, tableLoc, Optional.of("field"));
    this.hiveMetastoreTestUtils.addTestPartition(avroTable, ImmutableList.of("f1"), (int) System.currentTimeMillis());

    List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);
    // One workunit for the partition + 1 dummy watermark workunit
    Assert.assertEquals(workUnits.size(), 2);

    // The watermark work unit's position in the list is not fixed; select the real one.
    WorkUnit first = workUnits.get(0);
    WorkUnit second = workUnits.get(1);
    HiveWorkUnit hiveWorkUnit = first.contains(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)
        ? new HiveWorkUnit(second)
        : new HiveWorkUnit(first);

    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getDb(), db);
    Assert.assertEquals(hiveWorkUnit.getHiveDataset().getDbAndTable().getTable(), table);
    Assert.assertEquals(hiveWorkUnit.getPartitionName().get(), "field=f1");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.metastore.api.Table) HiveWorkUnit(org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit) HiveWorkUnit(org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Example 78 with WorkUnit

Example of using org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project.

From the class ConvertibleHiveDatasetTest, method testLineageInfo:

@Test
public void testLineageInfo() throws Exception {
    // Attach lineage info for a convertible dataset with two destinations
    // (nested ORC and flattened ORC) and verify the properties written to the work unit.
    String confResource = "convertibleHiveDatasetTest/flattenedAndNestedOrc.conf";
    Config config = ConfigFactory.parseResources(confResource).getConfig("hive.conversion.avro");
    WorkUnit workUnit = WorkUnit.createEmpty();
    Gson gson = new Gson();
    HiveSource.setLineageInfo(createTestConvertibleDataset(config), workUnit, getSharedJobBroker());
    Properties props = workUnit.getSpecProperties();

    // Assert that the lineage name is the fully qualified source table.
    Assert.assertEquals(props.getProperty("gobblin.event.lineage.name"), "db1.tb1");

    // Assert that the lineage source descriptor points at the Hive source table.
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.source"));
    DatasetDescriptor source = gson.fromJson(props.getProperty("gobblin.event.lineage.source"), DatasetDescriptor.class);
    Assert.assertEquals(source.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(source.getName(), "db1.tb1");

    // Assert that the first destination is the nested ORC table.
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.1.destination"));
    DatasetDescriptor firstDest = gson.fromJson(props.getProperty("gobblin.event.lineage.branch.1.destination"), DatasetDescriptor.class);
    Assert.assertEquals(firstDest.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(firstDest.getName(), "db1_nestedOrcDb.tb1_nestedOrc");

    // Assert that the second destination is the flattened ORC table.
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.2.destination"));
    DatasetDescriptor secondDest = gson.fromJson(props.getProperty("gobblin.event.lineage.branch.2.destination"), DatasetDescriptor.class);
    Assert.assertEquals(secondDest.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(secondDest.getName(), "db1_flattenedOrcDb.tb1_flattenedOrc");

    // One lineage event builder per destination branch.
    Collection<LineageEventBuilder> lineageEventBuilders = LineageInfo.load(Collections.singleton(workUnit));
    Assert.assertEquals(lineageEventBuilders.size(), 2);
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ConversionConfig(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset.ConversionConfig) Gson(com.google.gson.Gson) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) LineageEventBuilder(org.apache.gobblin.metrics.event.lineage.LineageEventBuilder) Properties(java.util.Properties) Test(org.testng.annotations.Test)

Example 79 with WorkUnit

Example of using org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project.

From the class HiveMaterializerTest, method testMaterializeView:

@Test
public void testMaterializeView() throws Exception {
    // Materialize a filtered Hive view into an Avro-backed destination table
    // and verify that only the view's rows were copied.
    String destinationTable = "materializeView";
    File stagingDir = Files.createTempDir();
    stagingDir.deleteOnExit();

    String viewName = "myView";
    this.jdbcConnector.executeStatements(String.format("CREATE VIEW %s.%s AS SELECT * FROM %s.%s WHERE name = 'foo'", this.dbName, viewName, this.dbName, this.sourceTableName));

    // Fetch the view's metastore definition; the client is returned to the pool on close.
    Table view;
    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
        view = new Table(client.get().getTable(this.dbName, viewName));
    }

    HiveDataset viewDataset = new HiveDataset(FileSystem.getLocal(new Configuration()), pool, view, new Properties());
    TableLikeStageableTableMetadata stageableMetadata =
        new TableLikeStageableTableMetadata(viewDataset.getTable(), this.dbName, destinationTable, stagingDir.getAbsolutePath());
    WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(viewDataset, HiveConverterUtils.StorageFormat.AVRO, stageableMetadata, null);

    // Run and commit must both end in a SUCCESSFUL working state.
    HiveMaterializer materializer = new HiveMaterializer(getTaskContextForRun(workUnit));
    materializer.run();
    Assert.assertEquals(materializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);
    materializer.commit();
    Assert.assertEquals(materializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);

    List<List<String>> rows = executeStatementAndGetResults(this.jdbcConnector, String.format("SELECT * FROM %s.%s", this.dbName, destinationTable), 3);
    Assert.assertEquals(rows.size(), 4);
    Assert.assertEquals(rows.stream().map(row -> row.get(0)).collect(Collectors.toList()), Lists.newArrayList("101", "103", "201", "203"));
}
Also used : Table(org.apache.hadoop.hive.ql.metadata.Table) Configuration(org.apache.hadoop.conf.Configuration) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) Properties(java.util.Properties) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset) ArrayList(java.util.ArrayList) List(java.util.List) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) TableLikeStageableTableMetadata(org.apache.gobblin.data.management.conversion.hive.entities.TableLikeStageableTableMetadata) File(java.io.File) Test(org.testng.annotations.Test)

Example 80 with WorkUnit

Example of using org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project.

From the class HiveMaterializerTest, method testMaterializeQuery:

@Test
public void testMaterializeQuery() throws Exception {
    // Materialize the result set of an ad-hoc query into an Avro-backed
    // destination table and verify the rows that were written.
    String destinationTable = "materializeQuery";
    File stagingDir = Files.createTempDir();
    stagingDir.deleteOnExit();

    String query = String.format("SELECT * FROM %s.%s WHERE name = 'foo'", this.dbName, this.sourceTableName);
    TableLikeStageableTableMetadata stageableMetadata =
        new TableLikeStageableTableMetadata(this.dataset.getTable(), this.dbName, destinationTable, stagingDir.getAbsolutePath());
    WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, HiveConverterUtils.StorageFormat.AVRO, stageableMetadata);

    // Run and commit must both end in a SUCCESSFUL working state.
    HiveMaterializer materializer = new HiveMaterializer(getTaskContextForRun(workUnit));
    materializer.run();
    Assert.assertEquals(materializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);
    materializer.commit();
    Assert.assertEquals(materializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);

    List<List<String>> rows = executeStatementAndGetResults(this.jdbcConnector, String.format("SELECT * FROM %s.%s", this.dbName, destinationTable), 3);
    Assert.assertEquals(rows.size(), 4);
    Assert.assertEquals(rows.stream().map(row -> row.get(0)).collect(Collectors.toList()), Lists.newArrayList("101", "103", "201", "203"));
}
Also used : ArrayList(java.util.ArrayList) List(java.util.List) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) TableLikeStageableTableMetadata(org.apache.gobblin.data.management.conversion.hive.entities.TableLikeStageableTableMetadata) File(java.io.File) Test(org.testng.annotations.Test)

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)133 Test (org.testng.annotations.Test)59 SourceState (org.apache.gobblin.configuration.SourceState)40 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)40 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)35 Extract (org.apache.gobblin.source.workunit.Extract)24 Path (org.apache.hadoop.fs.Path)19 State (org.apache.gobblin.configuration.State)13 IOException (java.io.IOException)11 ArrayList (java.util.ArrayList)10 Closer (com.google.common.io.Closer)9 Properties (java.util.Properties)9 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)8 List (java.util.List)7 Table (org.apache.hadoop.hive.ql.metadata.Table)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 Config (com.typesafe.config.Config)6 File (java.io.File)6 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6