Search in sources :

Example 11 with DatasetDescriptor

use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.

In the class BaseDataPublisherTest, method testPublishMultiTasks.

@Test
public void testPublishMultiTasks() throws IOException {
    // Two task states that share the same Kafka source descriptor.
    WorkUnitState firstTask = buildTaskState(2);
    WorkUnitState secondTask = buildTaskState(2);
    LineageInfo lineage = LineageInfo.getLineageInfo(firstTask.getTaskBroker()).get();
    DatasetDescriptor kafkaSource = new DatasetDescriptor("kafka", "testTopic");
    lineage.setSource(kafkaSource, firstTask);
    lineage.setSource(kafkaSource, secondTask);
    // Publishing both tasks should record a destination descriptor for each branch on each state.
    BaseDataPublisher publisher = new BaseDataPublisher(firstTask);
    publisher.publishData(ImmutableList.of(firstTask, secondTask));
    for (WorkUnitState task : ImmutableList.of(firstTask, secondTask)) {
        Assert.assertTrue(task.contains("gobblin.event.lineage.branch.0.destination"));
        Assert.assertTrue(task.contains("gobblin.event.lineage.branch.1.destination"));
    }
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) Test(org.testng.annotations.Test)

Example 12 with DatasetDescriptor

use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.

In the class ConvertibleHiveDatasetTest, method testLineageInfo.

@Test
public void testLineageInfo() throws Exception {
    // Build a work unit from the test conversion config and let HiveSource attach lineage to it.
    String testConfFilePath = "convertibleHiveDatasetTest/flattenedAndNestedOrc.conf";
    Config config = ConfigFactory.parseResources(testConfFilePath).getConfig("hive.conversion.avro");
    WorkUnit workUnit = WorkUnit.createEmpty();
    // Local variable: lowerCamelCase, not UPPER_SNAKE_CASE (that convention is for constants).
    Gson gson = new Gson();
    HiveSource.setLineageInfo(createTestConvertibleDataset(config), workUnit, getSharedJobBroker());
    Properties props = workUnit.getSpecProperties();
    // Assert that lineage name is correct
    Assert.assertEquals(props.getProperty("gobblin.event.lineage.name"), "db1.tb1");
    // Assert that source is correct for lineage event
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.source"));
    DatasetDescriptor sourceDD = gson.fromJson(props.getProperty("gobblin.event.lineage.source"), DatasetDescriptor.class);
    Assert.assertEquals(sourceDD.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(sourceDD.getName(), "db1.tb1");
    // Assert that first dest is correct for lineage event
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.1.destination"));
    DatasetDescriptor destDD1 = gson.fromJson(props.getProperty("gobblin.event.lineage.branch.1.destination"), DatasetDescriptor.class);
    Assert.assertEquals(destDD1.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(destDD1.getName(), "db1_nestedOrcDb.tb1_nestedOrc");
    // Assert that second dest is correct for lineage event
    Assert.assertTrue(props.containsKey("gobblin.event.lineage.branch.2.destination"));
    DatasetDescriptor destDD2 = gson.fromJson(props.getProperty("gobblin.event.lineage.branch.2.destination"), DatasetDescriptor.class);
    Assert.assertEquals(destDD2.getPlatform(), DatasetConstants.PLATFORM_HIVE);
    Assert.assertEquals(destDD2.getName(), "db1_flattenedOrcDb.tb1_flattenedOrc");
    // Assert that there are two eventBuilders for nestedOrc and flattenedOrc
    Collection<LineageEventBuilder> lineageEventBuilders = LineageInfo.load(Collections.singleton(workUnit));
    Assert.assertEquals(lineageEventBuilders.size(), 2);
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ConversionConfig(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset.ConversionConfig) Gson(com.google.gson.Gson) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) LineageEventBuilder(org.apache.gobblin.metrics.event.lineage.LineageEventBuilder) Properties(java.util.Properties) Test(org.testng.annotations.Test)

Example 13 with DatasetDescriptor

use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.

In the class CopyableFileTest, method testSetFsDatasets.

@Test
public void testSetFsDatasets() throws URISyntaxException {
    // Mocked origin FS (hdfs) and target FS (local file) supplying only URI and scheme.
    FileSystem originFs = mock(FileSystem.class);
    String originFsUri = "hdfs://source.company.biz:2000";
    String originPath = "/data/databases/source/profile";
    when(originFs.getUri()).thenReturn(new URI(originFsUri));
    when(originFs.getScheme()).thenReturn("hdfs");
    FileSystem targetFs = mock(FileSystem.class);
    String targetFsUri = "file:///";
    String destinationPath = "/data/databases/destination/profile";
    when(targetFs.getUri()).thenReturn(new URI(targetFsUri));
    when(targetFs.getScheme()).thenReturn("file");
    // Test when source file is not a directory.
    // Use uppercase 'L' long literals: lowercase 'l' is easily misread as the digit '1'.
    FileStatus origin = new FileStatus(0L, false, 0, 0L, 0L, new Path(originPath));
    CopyableFile copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null, PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "");
    copyableFile.setFsDatasets(originFs, targetFs);
    // For a plain file, the dataset name is the parent directory of the file.
    DatasetDescriptor source = copyableFile.getSourceDataset();
    Assert.assertEquals(source.getName(), "/data/databases/source");
    Assert.assertEquals(source.getPlatform(), "hdfs");
    Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri);
    DatasetDescriptor destination = copyableFile.getDestinationDataset();
    Assert.assertEquals(destination.getName(), "/data/databases/destination");
    Assert.assertEquals(destination.getPlatform(), "file");
    Assert.assertEquals(destination.getMetadata().get("fsUri"), targetFsUri);
    // Test when source file is a directory (fully-qualified paths this time).
    originPath = originFsUri + originPath;
    destinationPath = targetFsUri + destinationPath;
    origin = new FileStatus(0L, true, 0, 0L, 0L, new Path(originPath));
    copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null, PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "");
    copyableFile.setFsDatasets(originFs, targetFs);
    // For a directory, the dataset name is the directory itself.
    source = copyableFile.getSourceDataset();
    Assert.assertEquals(source.getName(), "/data/databases/source/profile");
    Assert.assertEquals(source.getPlatform(), "hdfs");
    Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri);
    destination = copyableFile.getDestinationDataset();
    Assert.assertEquals(destination.getName(), "/data/databases/destination/profile");
    Assert.assertEquals(destination.getPlatform(), "file");
    Assert.assertEquals(destination.getMetadata().get("fsUri"), targetFsUri);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) FileSystem(org.apache.hadoop.fs.FileSystem) URI(java.net.URI) Test(org.testng.annotations.Test)

Example 14 with DatasetDescriptor

use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.

In the class HiveCopyEntityHelper, method setCopyableFileDatasets.

/**
 * Populate the source and destination {@link DatasetDescriptor}s of the given copyable file,
 * using the wrapped Hive table as the source and the configured target db/table as the destination.
 */
void setCopyableFileDatasets(CopyableFile copyableFile) {
    // Source: the Hive table backing this dataset, qualified as "db.table".
    String srcName = dataset.getTable().getDbName() + "." + dataset.getTable().getTableName();
    DatasetDescriptor srcDescriptor = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, srcName);
    srcDescriptor.addMetadata(DatasetConstants.FS_URI, dataset.getFs().getUri().toString());
    copyableFile.setSourceDataset(srcDescriptor);
    // Destination: the configured target database/table on the target filesystem.
    String destName = this.getTargetDatabase() + "." + this.getTargetTable();
    DatasetDescriptor destDescriptor = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destName);
    destDescriptor.addMetadata(DatasetConstants.FS_URI, this.getTargetFs().getUri().toString());
    copyableFile.setDestinationDataset(destDescriptor);
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) ToString(lombok.ToString)

Example 15 with DatasetDescriptor

use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.

In the class TimePartitionedDataPublisher, method createDestinationDescriptor.

@Override
protected DatasetDescriptor createDestinationDescriptor(WorkUnitState state, int branchId) {
    // Start from the base destination descriptor built by the parent publisher.
    DatasetDescriptor base = super.createDestinationDescriptor(state, branchId);
    // Branch-specific writer partition prefix; empty string when not configured.
    String prefixKey = ForkOperatorUtils.getPropertyNameForBranch(TimeBasedWriterPartitioner.WRITER_PARTITION_PREFIX, numBranches, branchId);
    String partitionPrefix = state.getProp(prefixKey, "");
    // Decorate the dataset name with the time-partition prefix.
    String decoratedName = new Path(base.getName(), partitionPrefix).toString();
    DatasetDescriptor decorated = new DatasetDescriptor(base.getPlatform(), decoratedName);
    // Carry over every metadata entry from the base descriptor.
    base.getMetadata().forEach(decorated::addMetadata);
    return decorated;
}
Also used : Path(org.apache.hadoop.fs.Path) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor)

Aggregations

DatasetDescriptor (org.apache.gobblin.dataset.DatasetDescriptor)16 Test (org.testng.annotations.Test)6 Path (org.apache.hadoop.fs.Path)5 LineageInfo (org.apache.gobblin.metrics.event.lineage.LineageInfo)4 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)3 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)3 Gson (com.google.gson.Gson)2 SourceState (org.apache.gobblin.configuration.SourceState)2 State (org.apache.gobblin.configuration.State)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 ImmutableMap (com.google.common.collect.ImmutableMap)1 Config (com.typesafe.config.Config)1 URI (java.net.URI)1 Map (java.util.Map)1 Properties (java.util.Properties)1 ToString (lombok.ToString)1 ConversionConfig (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset.ConversionConfig)1 LineageEventBuilder (org.apache.gobblin.metrics.event.lineage.LineageEventBuilder)1 QueryBasedSource (org.apache.gobblin.source.extractor.extract.QueryBasedSource)1 Extract (org.apache.gobblin.source.workunit.Extract)1