use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.
the class HiveSource method setLineageInfo.
public static void setLineageInfo(ConvertibleHiveDataset convertibleHiveDataset, WorkUnit workUnit,
    SharedResourcesBroker<GobblinScopeTypes> sharedJobBroker) throws IOException {
  String sourceTable = convertibleHiveDataset.getTable().getDbName() + "."
      + convertibleHiveDataset.getTable().getTableName();
  DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, sourceTable);
  source.addMetadata(DatasetConstants.FS_URI,
      convertibleHiveDataset.getTable().getDataLocation().getFileSystem(new Configuration()).getUri().toString());
  int virtualBranch = 0;
  for (String format : convertibleHiveDataset.getDestFormats()) {
    ++virtualBranch;
    Optional<ConvertibleHiveDataset.ConversionConfig> conversionConfigForFormat =
        convertibleHiveDataset.getConversionConfigForFormat(format);
    Optional<LineageInfo> lineageInfo = LineageInfo.getLineageInfo(sharedJobBroker);
    // Skip this virtual branch when lineage is disabled or the format has no conversion config
    if (!lineageInfo.isPresent() || !conversionConfigForFormat.isPresent()) {
      continue;
    }
    String destTable = conversionConfigForFormat.get().getDestinationDbName() + "."
        + conversionConfigForFormat.get().getDestinationTableName();
    DatasetDescriptor dest = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destTable);
    Path destPath = new Path(conversionConfigForFormat.get().getDestinationDataPath());
    dest.addMetadata(DatasetConstants.FS_URI, destPath.getFileSystem(new Configuration()).getUri().toString());
    lineageInfo.get().setSource(source, workUnit);
    lineageInfo.get().putDestination(dest, virtualBranch, workUnit);
  }
}
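For context, a minimal sketch of how a job might invoke this helper while building work units. The names dataset, jobBroker and log are hypothetical stand-ins for objects the surrounding Source implementation would provide:

WorkUnit workUnit = WorkUnit.createEmpty();
try {
  // 'dataset' and 'jobBroker' are assumed to come from the job context
  HiveSource.setLineageInfo(dataset, workUnit, jobBroker);
} catch (IOException e) {
  // Lineage is best-effort here; failing to resolve a filesystem URI need not fail work unit creation
  log.warn("Failed to set lineage info", e);
}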
use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.
the class LineageInfo method setSource.
/**
 * Set the source {@link DatasetDescriptor} of a lineage event
 *
 * <p>
 * Only the {@link org.apache.gobblin.source.Source} or its {@link org.apache.gobblin.source.extractor.Extractor}
 * is supposed to set the source for a work unit of a dataset
 * </p>
 *
 * @param source the source dataset descriptor
 * @param state state about a {@link org.apache.gobblin.source.workunit.WorkUnit}
 */
public void setSource(DatasetDescriptor source, State state) {
  DatasetDescriptor descriptor = resolver.resolve(source, state);
  if (descriptor == null) {
    // The resolver filtered out this descriptor; nothing to record
    return;
  }
  state.setProp(getKey(NAME_KEY), descriptor.getName());
  state.setProp(getKey(LineageEventBuilder.SOURCE), GSON.toJson(descriptor));
}
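A minimal usage sketch, assuming a broker and workUnit supplied by the job context (both names are illustrative); the pattern mirrors the test further below:

Optional<LineageInfo> lineageInfo = LineageInfo.getLineageInfo(broker);
if (lineageInfo.isPresent()) {
  // Record where the data comes from; destinations are added later, per branch
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.get().setSource(source, workUnit);
}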
use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.
the class LineageInfo method load.
/**
 * Load all lineage info from a {@link State}
 *
 * @return a map from branch to its lineage info; if there is no destination info, an empty map is returned
 */
static Map<String, LineageEventBuilder> load(State state) {
  String name = state.getProp(getKey(NAME_KEY));
  DatasetDescriptor source = GSON.fromJson(state.getProp(getKey(LineageEventBuilder.SOURCE)), DatasetDescriptor.class);
  String branchedPrefix = getKey(BRANCH, "");
  Map<String, LineageEventBuilder> events = Maps.newHashMap();
  for (Map.Entry<Object, Object> entry : state.getProperties().entrySet()) {
    String key = entry.getKey().toString();
    if (!key.startsWith(branchedPrefix)) {
      continue;
    }
    // A branched key has the form <branchedPrefix><branchId>.<field>
    String[] parts = key.substring(branchedPrefix.length()).split("\\.");
    assert parts.length == 2;
    String branchId = parts[0];
    LineageEventBuilder event = events.get(branchId);
    if (event == null) {
      // First key seen for this branch: create its event with a copy of the shared source
      event = new LineageEventBuilder(name);
      event.setSource(new DatasetDescriptor(source));
      events.put(branchId, event);
    }
    switch (parts[1]) {
      case LineageEventBuilder.DESTINATION:
        DatasetDescriptor destination = GSON.fromJson(entry.getValue().toString(), DatasetDescriptor.class);
        event.setDestination(destination);
        break;
      default:
        throw new RuntimeException("Unsupported lineage key: " + key);
    }
  }
  return events;
}
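The branched keys parsed above therefore carry one destination per branch while all branches share the same source. A short, illustrative sketch of consuming the loaded map (the printing is for demonstration only):

Map<String, LineageEventBuilder> events = LineageInfo.load(state);
for (Map.Entry<String, LineageEventBuilder> entry : events.entrySet()) {
  // One builder per branch id, each carrying the shared source and its own destination
  System.out.println("branch " + entry.getKey() + " -> " + entry.getValue());
}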
use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.
the class LineageEventTest method testEvent.
@Test
public void testEvent() {
  final String topic = "testTopic";
  final String kafka = "kafka";
  final String hdfs = "hdfs";
  final String mysql = "mysql";
  final String branch = "branch";
  State state0 = new State();
  LineageInfo lineageInfo = getLineageInfo();
  DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
  lineageInfo.setSource(source, state0);
  DatasetDescriptor destination00 = new DatasetDescriptor(hdfs, "/data/dbchanges");
  destination00.addMetadata(branch, "0");
  lineageInfo.putDestination(destination00, 0, state0);
  DatasetDescriptor destination01 = new DatasetDescriptor(mysql, "kafka.testTopic");
  destination01.addMetadata(branch, "1");
  lineageInfo.putDestination(destination01, 1, state0);
  Map<String, LineageEventBuilder> events = LineageInfo.load(state0);
  verify(events.get("0"), topic, source, destination00);
  verify(events.get("1"), topic, source, destination01);
  State state1 = new State();
  lineageInfo.setSource(source, state1);
  List<State> states = Lists.newArrayList();
  states.add(state0);
  states.add(state1);
  // Test that only full-fledged lineage events (with both source and destination) are loaded
  Collection<LineageEventBuilder> eventsList = LineageInfo.load(states);
  Assert.assertTrue(eventsList.size() == 2);
  Assert.assertEquals(getLineageEvent(eventsList, 0, hdfs), events.get("0"));
  Assert.assertEquals(getLineageEvent(eventsList, 1, mysql), events.get("1"));
  // Now there are 3 full-fledged lineage events
  DatasetDescriptor destination12 = new DatasetDescriptor(mysql, "kafka.testTopic2");
  destination12.addMetadata(branch, "2");
  lineageInfo.putDestination(destination12, 2, state1);
  eventsList = LineageInfo.load(states);
  Assert.assertTrue(eventsList.size() == 3);
  Assert.assertEquals(getLineageEvent(eventsList, 0, hdfs), events.get("0"));
  Assert.assertEquals(getLineageEvent(eventsList, 1, mysql), events.get("1"));
  verify(getLineageEvent(eventsList, 2, mysql), topic, source, destination12);
  // Five lineage events have been put, but only 4 are unique
  DatasetDescriptor destination10 = destination12;
  lineageInfo.putDestination(destination10, 0, state1);
  DatasetDescriptor destination11 = new DatasetDescriptor("hive", "kafka.testTopic1");
  destination11.addMetadata(branch, "1");
  lineageInfo.putDestination(destination11, 1, state1);
  eventsList = LineageInfo.load(states);
  Assert.assertTrue(eventsList.size() == 4);
  Assert.assertEquals(getLineageEvent(eventsList, 0, hdfs), events.get("0"));
  Assert.assertEquals(getLineageEvent(eventsList, 1, mysql), events.get("1"));
  // Either branch 0 or branch 2 of state1 is selected for the duplicate destination
  LineageEventBuilder event12 = getLineageEvent(eventsList, 0, mysql);
  if (event12 == null) {
    event12 = getLineageEvent(eventsList, 2, mysql);
  }
  verify(event12, topic, source, destination12);
  verify(getLineageEvent(eventsList, 1, "hive"), topic, source, destination11);
}
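The test relies on two helpers not shown here, getLineageInfo() and getLineageEvent(...). A hedged reconstruction of the latter, purely to illustrate the lookup it performs; the actual helper in LineageEventTest, and the exact DatasetDescriptor accessors, may differ:

// Hedged reconstruction, not the actual test helper: find the event whose
// destination matches the given branch id and platform, or null if none does
private LineageEventBuilder getLineageEvent(Collection<LineageEventBuilder> events, int branchId, String platform) {
  for (LineageEventBuilder event : events) {
    DatasetDescriptor destination = event.getDestination();
    if (destination != null && platform.equals(destination.getPlatform())
        && String.valueOf(branchId).equals(destination.getMetadata().get("branch"))) {
      return event;
    }
  }
  return null;
}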
use of org.apache.gobblin.dataset.DatasetDescriptor in project incubator-gobblin by apache.
the class BaseDataPublisher method createDestinationDescriptor.
protected DatasetDescriptor createDestinationDescriptor(WorkUnitState state, int branchId) {
  Path publisherOutputDir = getPublisherOutputDir(state, branchId);
  FileSystem fs = this.publisherFileSystemByBranches.get(branchId);
  DatasetDescriptor destination = new DatasetDescriptor(fs.getScheme(), publisherOutputDir.toString());
  destination.addMetadata(DatasetConstants.FS_URI, fs.getUri().toString());
  destination.addMetadata(DatasetConstants.BRANCH, String.valueOf(branchId));
  return destination;
}
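A hedged sketch of how a publisher might pair this descriptor with LineageInfo when publishing a branch; lineageInfo is assumed to be an Optional<LineageInfo> obtained earlier via LineageInfo.getLineageInfo(broker):

DatasetDescriptor destination = createDestinationDescriptor(state, branchId);
if (lineageInfo.isPresent()) {
  // Destinations are branch-scoped: one per (branchId, state) pair
  lineageInfo.get().putDestination(destination, branchId, state);
}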