Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project, taken from the class HiveSourceTest, method testGetWorkunitsAfterWatermark.
/**
 * Verifies watermark-based filtering of work units: a table whose previous run already
 * committed a watermark at its creation time should be skipped, so only the table
 * without prior state is expected to yield a work unit.
 */
@Test
public void testGetWorkunitsAfterWatermark() throws Exception {
  final String dbName = "testdb4";
  final String alreadyProcessedTable = "testtable1";
  final String newTable = "testtable2";

  // Start from a clean metastore state for this database.
  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
  this.hiveMetastoreTestUtils.createTestAvroTable(dbName, alreadyProcessedTable, "/tmp/testtable1", Optional.<String>absent());
  this.hiveMetastoreTestUtils.createTestAvroTable(dbName, newTable, "/tmp/testtable2", Optional.<String>absent(), true);

  // Simulate a previous run that processed table1 up to its creation time
  // (Hive reports createTime in seconds; the watermark is in milliseconds).
  Table table1 = this.hiveMetastoreTestUtils.getLocalMetastoreClient().getTable(dbName, alreadyProcessedTable);
  long table1CreateTimeMillis = TimeUnit.MILLISECONDS.convert(table1.getCreateTime(), TimeUnit.SECONDS);
  List<WorkUnitState> previousWorkUnitStates =
      Lists.newArrayList(ConversionHiveTestUtils.createWus(dbName, alreadyProcessedTable, table1CreateTimeMillis));

  SourceState testState = new SourceState(getTestState(dbName), previousWorkUnitStates);
  testState.setProp(HiveSource.HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY, TableLevelWatermarker.Factory.class.getName());

  List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);

  // Only the table without a committed watermark should produce a work unit.
  Assert.assertEquals(workUnits.size(), 1);
  HiveWorkUnit hwu = new HiveWorkUnit(workUnits.get(0));
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getDb(), dbName);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getTable(), newTable);
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project, taken from the class HiveSourceTest, method testGetWorkUnitsForPartitions.
/**
 * Verifies that a partitioned table yields one work unit for its partition plus the
 * dummy watermark work unit added by the partition-level watermarker, and that the
 * partition work unit carries the expected db/table/partition identifiers.
 */
@Test
public void testGetWorkUnitsForPartitions() throws Exception {
  String dbName = "testdb3";
  String tableName = "testtable3";
  String tableSdLoc = "/tmp/testtable3";
  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
  SourceState testState = getTestState(dbName);
  Table tbl = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.of("field"));
  // Hive stores createTime as an int in SECONDS. Casting currentTimeMillis() straight to
  // int overflows (epoch millis >> Integer.MAX_VALUE) and yields an arbitrary, often
  // negative timestamp; convert to seconds before narrowing.
  this.hiveMetastoreTestUtils.addTestPartition(tbl, ImmutableList.of("f1"),
      (int) TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis()));
  List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);
  // One workunit for the partition + 1 dummy watermark workunit
  Assert.assertEquals(workUnits.size(), 2);
  WorkUnit wu = workUnits.get(0);
  WorkUnit wu2 = workUnits.get(1);
  // The ordering of the real work unit vs. the watermark work unit is not guaranteed;
  // pick whichever one is not the watermark carrier.
  HiveWorkUnit hwu = null;
  if (!wu.contains(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    hwu = new HiveWorkUnit(wu);
  } else {
    hwu = new HiveWorkUnit(wu2);
  }
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getDb(), dbName);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getTable(), tableName);
  Assert.assertEquals(hwu.getPartitionName().get(), "field=f1");
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project, taken from the class ConvertibleHiveDatasetTest, method testLineageInfo.
/**
 * Checks that HiveSource.setLineageInfo records the expected lineage metadata on a
 * work unit: the dataset name, the Hive source descriptor, and one destination
 * descriptor per conversion branch (nested ORC and flattened ORC).
 */
@Test
public void testLineageInfo() throws Exception {
  Config config = ConfigFactory.parseResources("convertibleHiveDatasetTest/flattenedAndNestedOrc.conf")
      .getConfig("hive.conversion.avro");
  WorkUnit workUnit = WorkUnit.createEmpty();
  HiveSource.setLineageInfo(createTestConvertibleDataset(config), workUnit, getSharedJobBroker());
  Properties props = workUnit.getSpecProperties();
  Gson gson = new Gson();

  // Assert that the lineage name identifies the source db/table.
  Assert.assertEquals(props.getProperty("gobblin.event.lineage.name"), "db1.tb1");

  // Assert that the source descriptor of the lineage event is correct.
  Assert.assertTrue(props.containsKey("gobblin.event.lineage.source"));
  DatasetDescriptor sourceDescriptor =
      gson.fromJson(props.getProperty("gobblin.event.lineage.source"), DatasetDescriptor.class);
  Assert.assertEquals(sourceDescriptor.getPlatform(), DatasetConstants.PLATFORM_HIVE);
  Assert.assertEquals(sourceDescriptor.getName(), "db1.tb1");

  // Assert that the first destination (nested ORC branch) is correct.
  Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.1.destination"));
  DatasetDescriptor firstDestination =
      gson.fromJson(props.getProperty("gobblin.event.lineage.branch.1.destination"), DatasetDescriptor.class);
  Assert.assertEquals(firstDestination.getPlatform(), DatasetConstants.PLATFORM_HIVE);
  Assert.assertEquals(firstDestination.getName(), "db1_nestedOrcDb.tb1_nestedOrc");

  // Assert that the second destination (flattened ORC branch) is correct.
  Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.2.destination"));
  DatasetDescriptor secondDestination =
      gson.fromJson(props.getProperty("gobblin.event.lineage.branch.2.destination"), DatasetDescriptor.class);
  Assert.assertEquals(secondDestination.getPlatform(), DatasetConstants.PLATFORM_HIVE);
  Assert.assertEquals(secondDestination.getName(), "db1_flattenedOrcDb.tb1_flattenedOrc");

  // One lineage event builder per destination branch.
  Collection<LineageEventBuilder> lineageEventBuilders = LineageInfo.load(Collections.singleton(workUnit));
  Assert.assertEquals(lineageEventBuilders.size(), 2);
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project, taken from the class HiveMaterializerTest, method testMaterializeView.
/**
 * Materializes a Hive view into an Avro-backed destination table and verifies that the
 * materializer completes successfully and the destination contains the filtered rows.
 */
@Test
public void testMaterializeView() throws Exception {
  String destinationTable = "materializeView";
  String viewName = "myView";
  File stagingDir = Files.createTempDir();
  stagingDir.deleteOnExit();

  // Define a view that filters the source table down to rows where name = 'foo'.
  this.jdbcConnector.executeStatements(String.format("CREATE VIEW %s.%s AS SELECT * FROM %s.%s WHERE name = 'foo'", this.dbName, viewName, this.dbName, this.sourceTableName));

  Table view;
  try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
    view = new Table(client.get().getTable(this.dbName, viewName));
  }

  HiveDataset viewDataset = new HiveDataset(FileSystem.getLocal(new Configuration()), pool, view, new Properties());
  TableLikeStageableTableMetadata destinationMetadata =
      new TableLikeStageableTableMetadata(viewDataset.getTable(), this.dbName, destinationTable, stagingDir.getAbsolutePath());
  WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(
      viewDataset, HiveConverterUtils.StorageFormat.AVRO, destinationMetadata, null);

  // Run and commit; both phases should end in SUCCESSFUL.
  HiveMaterializer hiveMaterializer = new HiveMaterializer(getTaskContextForRun(workUnit));
  hiveMaterializer.run();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);
  hiveMaterializer.commit();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);

  // The destination table should hold exactly the rows matched by the view.
  List<List<String>> allRows = executeStatementAndGetResults(this.jdbcConnector,
      String.format("SELECT * FROM %s.%s", this.dbName, destinationTable), 3);
  Assert.assertEquals(allRows.size(), 4);
  Assert.assertEquals(allRows.stream().map(row -> row.get(0)).collect(Collectors.toList()),
      Lists.newArrayList("101", "103", "201", "203"));
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project, taken from the class HiveMaterializerTest, method testMaterializeQuery.
/**
 * Materializes the result of an ad-hoc query into an Avro-backed destination table and
 * verifies that the materializer completes successfully and the rows match the filter.
 */
@Test
public void testMaterializeQuery() throws Exception {
  String destinationTable = "materializeQuery";
  File stagingDir = Files.createTempDir();
  stagingDir.deleteOnExit();

  // Build a work unit that materializes the filtered query result into the destination.
  String query = String.format("SELECT * FROM %s.%s WHERE name = 'foo'", this.dbName, this.sourceTableName);
  TableLikeStageableTableMetadata destinationMetadata =
      new TableLikeStageableTableMetadata(this.dataset.getTable(), this.dbName, destinationTable, stagingDir.getAbsolutePath());
  WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(
      query, HiveConverterUtils.StorageFormat.AVRO, destinationMetadata);

  // Run and commit; both phases should end in SUCCESSFUL.
  HiveMaterializer hiveMaterializer = new HiveMaterializer(getTaskContextForRun(workUnit));
  hiveMaterializer.run();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);
  hiveMaterializer.commit();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);

  // The destination table should hold exactly the rows matched by the query.
  List<List<String>> allRows = executeStatementAndGetResults(this.jdbcConnector,
      String.format("SELECT * FROM %s.%s", this.dbName, destinationTable), 3);
  Assert.assertEquals(allRows.size(), 4);
  Assert.assertEquals(allRows.stream().map(row -> row.get(0)).collect(Collectors.toList()),
      Lists.newArrayList("101", "103", "201", "203"));
}
Aggregations