Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class BaseDataPublisherTest, method testPublishMultiTasks.
@Test
public void testPublishMultiTasks() throws IOException {
  // Two task states, each configured with two branches, sharing one Kafka source descriptor.
  WorkUnitState firstTask = buildTaskState(2);
  WorkUnitState secondTask = buildTaskState(2);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(firstTask.getTaskBroker()).get();
  DatasetDescriptor kafkaSource = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(kafkaSource, firstTask);
  lineageInfo.setSource(kafkaSource, secondTask);

  // Publishing both tasks at once must record a destination for every branch of every task.
  BaseDataPublisher publisher = new BaseDataPublisher(firstTask);
  publisher.publishData(ImmutableList.of(firstTask, secondTask));

  for (WorkUnitState task : ImmutableList.of(firstTask, secondTask)) {
    for (int branch = 0; branch < 2; branch++) {
      Assert.assertTrue(task.contains("gobblin.event.lineage.branch." + branch + ".destination"));
    }
  }
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class ConvertibleHiveDatasetTest, method testLineageInfo.
@Test
public void testLineageInfo() throws Exception {
  // Build the convertible dataset from the flattened+nested ORC test config and set its lineage.
  Config config = ConfigFactory.parseResources("convertibleHiveDatasetTest/flattenedAndNestedOrc.conf")
      .getConfig("hive.conversion.avro");
  WorkUnit workUnit = WorkUnit.createEmpty();
  HiveSource.setLineageInfo(createTestConvertibleDataset(config), workUnit, getSharedJobBroker());
  Properties props = workUnit.getSpecProperties();
  Gson gson = new Gson();

  // Assert that the lineage name matches the source db.table.
  Assert.assertEquals(props.getProperty("gobblin.event.lineage.name"), "db1.tb1");

  // Assert that the source descriptor is the hive table db1.tb1.
  Assert.assertTrue(props.containsKey("gobblin.event.lineage.source"));
  DatasetDescriptor sourceDescriptor =
      gson.fromJson(props.getProperty("gobblin.event.lineage.source"), DatasetDescriptor.class);
  Assert.assertEquals(sourceDescriptor.getPlatform(), DatasetConstants.PLATFORM_HIVE);
  Assert.assertEquals(sourceDescriptor.getName(), "db1.tb1");

  // Assert the per-branch destinations: branch 1 is the nested ORC table, branch 2 the flattened one.
  String[] expectedDestNames = {"db1_nestedOrcDb.tb1_nestedOrc", "db1_flattenedOrcDb.tb1_flattenedOrc"};
  for (int branch = 1; branch <= expectedDestNames.length; branch++) {
    String destKey = "gobblin.event.lineage.branch." + branch + ".destination";
    Assert.assertTrue(props.containsKey(destKey));
    DatasetDescriptor destDescriptor = gson.fromJson(props.getProperty(destKey), DatasetDescriptor.class);
    Assert.assertEquals(destDescriptor.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(destDescriptor.getName(), expectedDestNames[branch - 1]);
  }

  // Assert that exactly two event builders were recorded (nestedOrc and flattenedOrc).
  Collection<LineageEventBuilder> lineageEventBuilders = LineageInfo.load(Collections.singleton(workUnit));
  Assert.assertEquals(lineageEventBuilders.size(), 2);
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class CopyableFileTest, method testSetFsDatasets.
@Test
public void testSetFsDatasets() throws URISyntaxException {
  // Mock an HDFS origin filesystem and a local destination filesystem.
  String sourceFsUri = "hdfs://source.company.biz:2000";
  String sourcePath = "/data/databases/source/profile";
  FileSystem sourceFs = mock(FileSystem.class);
  when(sourceFs.getUri()).thenReturn(new URI(sourceFsUri));
  when(sourceFs.getScheme()).thenReturn("hdfs");

  String destFsUri = "file:///";
  String destPath = "/data/databases/destination/profile";
  FileSystem destFs = mock(FileSystem.class);
  when(destFs.getUri()).thenReturn(new URI(destFsUri));
  when(destFs.getScheme()).thenReturn("file");

  // Case 1: the origin is a regular file — the dataset name is the parent directory.
  FileStatus origin = new FileStatus(0L, false, 0, 0L, 0L, new Path(sourcePath));
  CopyableFile copyableFile = new CopyableFile(origin, new Path(destPath), null, null, null,
      PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "");
  copyableFile.setFsDatasets(sourceFs, destFs);

  DatasetDescriptor sourceDataset = copyableFile.getSourceDataset();
  Assert.assertEquals(sourceDataset.getName(), "/data/databases/source");
  Assert.assertEquals(sourceDataset.getPlatform(), "hdfs");
  Assert.assertEquals(sourceDataset.getMetadata().get("fsUri"), sourceFsUri);
  DatasetDescriptor destDataset = copyableFile.getDestinationDataset();
  Assert.assertEquals(destDataset.getName(), "/data/databases/destination");
  Assert.assertEquals(destDataset.getPlatform(), "file");
  Assert.assertEquals(destDataset.getMetadata().get("fsUri"), destFsUri);

  // Case 2: the origin is a directory (given as fully-qualified URIs) — the dataset name is the path itself.
  origin = new FileStatus(0L, true, 0, 0L, 0L, new Path(sourceFsUri + sourcePath));
  copyableFile = new CopyableFile(origin, new Path(destFsUri + destPath), null, null, null,
      PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "");
  copyableFile.setFsDatasets(sourceFs, destFs);

  sourceDataset = copyableFile.getSourceDataset();
  Assert.assertEquals(sourceDataset.getName(), "/data/databases/source/profile");
  Assert.assertEquals(sourceDataset.getPlatform(), "hdfs");
  Assert.assertEquals(sourceDataset.getMetadata().get("fsUri"), sourceFsUri);
  destDataset = copyableFile.getDestinationDataset();
  Assert.assertEquals(destDataset.getName(), "/data/databases/destination/profile");
  Assert.assertEquals(destDataset.getPlatform(), "file");
  Assert.assertEquals(destDataset.getMetadata().get("fsUri"), destFsUri);
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class HiveCopyEntityHelper, method setCopyableFileDatasets.
/**
 * Populates the source and destination dataset descriptors of the given {@link CopyableFile}
 * with the hive tables involved in this copy, each tagged with its filesystem URI.
 */
void setCopyableFileDatasets(CopyableFile copyableFile) {
  // Source: this dataset's own db.table on the dataset's filesystem.
  DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE,
      dataset.getTable().getDbName() + "." + dataset.getTable().getTableName());
  source.addMetadata(DatasetConstants.FS_URI, dataset.getFs().getUri().toString());
  copyableFile.setSourceDataset(source);

  // Destination: the configured target db.table on the target filesystem.
  DatasetDescriptor destination = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE,
      this.getTargetDatabase() + "." + this.getTargetTable());
  destination.addMetadata(DatasetConstants.FS_URI, this.getTargetFs().getUri().toString());
  copyableFile.setDestinationDataset(destination);
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class TimePartitionedDataPublisher, method createDestinationDescriptor.
@Override
protected DatasetDescriptor createDestinationDescriptor(WorkUnitState state, int branchId) {
  // Start from the base destination descriptor built by the parent publisher.
  DatasetDescriptor base = super.createDestinationDescriptor(state, branchId);

  // Append this branch's writer time-partition prefix (empty if unset) to the base name.
  String prefixKey = ForkOperatorUtils.getPropertyNameForBranch(
      TimeBasedWriterPartitioner.WRITER_PARTITION_PREFIX, numBranches, branchId);
  Path partitionedPath = new Path(base.getName(), state.getProp(prefixKey, ""));

  // Rebuild the descriptor around the decorated name, carrying over all existing metadata.
  DatasetDescriptor decorated = new DatasetDescriptor(base.getPlatform(), partitionedPath.toString());
  base.getMetadata().forEach(decorated::addMetadata);
  return decorated;
}
Aggregations