use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.
the class BaseDataPublisher method addLineageInfo.
/**
 * Reports the destination of one branch of a work unit to the lineage info, if lineage is enabled.
 *
 * <p>When {@code lineageInfo} is absent, only an info message is logged. Otherwise a destination
 * dataset descriptor is created for the branch and combined with whatever
 * {@code PartitionedDataWriter.getPartitionInfoAndClean} returns for it: each returned partition is
 * re-parented onto the destination dataset and reported individually; if no partition is returned,
 * the dataset descriptor itself is reported.
 *
 * @param state the work unit state the destination is derived from and recorded into
 * @param branchId the branch whose destination is being reported
 */
private void addLineageInfo(WorkUnitState state, int branchId) {
  if (!this.lineageInfo.isPresent()) {
    LOG.info("Will not add lineage info");
    return;
  }

  // The destination descriptor is built before the partition info is fetched, as in the original flow.
  DatasetDescriptor destination = createDestinationDescriptor(state, branchId);
  List<PartitionDescriptor> partitionInfo = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId);

  // Partition level lineage: one descriptor per partition, attached to the destination dataset.
  List<Descriptor> lineageTargets = new ArrayList<>();
  for (PartitionDescriptor partition : partitionInfo) {
    lineageTargets.add(partition.copyWithNewDataset(destination));
  }

  // Dataset level lineage: nothing was added above, so report the dataset as a whole.
  if (lineageTargets.isEmpty()) {
    lineageTargets.add(destination);
  }

  this.lineageInfo.get().putDestination(lineageTargets, branchId, state);
}
use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.
the class LineageInfo method load.
/**
 * Rebuild all lineage events recorded in a {@link State}
 *
 * <p>
 * The source descriptor is read once from the state. Every property whose key starts with the
 * branch prefix is then split into a branch id and a field name; the only supported field is
 * {@link LineageEventBuilder#DESTINATION}, whose value is parsed as a JSON list of descriptors.
 * One event, pairing the source with that destination, is created per parsed descriptor.
 * </p>
 *
 * @param state the state holding the serialized lineage properties
 * @return A map from branch id to its lineage events. The map is empty if the state has no source
 *         descriptor or no branch-prefixed keys
 * @throws RuntimeException if a branch-prefixed key names a field other than the destination
 */
static Map<String, Set<LineageEventBuilder>> load(State state) {
  String eventName = state.getProp(getKey(NAME_KEY));
  Descriptor source = Descriptor.fromJson(state.getProp(getKey(LineageEventBuilder.SOURCE)));
  String branchPrefix = getKey(BRANCH, "");
  Map<String, Set<LineageEventBuilder>> eventsByBranch = Maps.newHashMap();

  // Without a source there is nothing to pair destinations with.
  if (source == null) {
    return eventsByBranch;
  }

  for (Map.Entry<Object, Object> property : state.getProperties().entrySet()) {
    String key = property.getKey().toString();
    if (key.startsWith(branchPrefix)) {
      // The remainder of the key is expected to be "<branchId>.<field>".
      String[] branchAndField = key.substring(branchPrefix.length()).split("\\.");
      assert branchAndField.length == 2;

      // The branch entry is registered before the field is validated, so a branch whose
      // destination list is empty still maps to an empty set.
      Set<LineageEventBuilder> eventsOfBranch =
          eventsByBranch.computeIfAbsent(branchAndField[0], id -> new HashSet<>());

      if (!LineageEventBuilder.DESTINATION.equals(branchAndField[1])) {
        throw new RuntimeException("Unsupported lineage key: " + key);
      }

      for (Descriptor destination : Descriptor.fromJsonList(property.getValue().toString())) {
        LineageEventBuilder lineageEvent = new LineageEventBuilder(eventName);
        lineageEvent.setSource(source);
        lineageEvent.setDestination(destination);
        eventsOfBranch.add(lineageEvent);
      }
    }
  }

  return eventsByBranch;
}
use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.
the class PartitionedDataWriter method serializePartitionInfoToState.
/**
 * Collect the partition descriptors of all partition writers and store them in {@link #state}
 *
 * <p>
 * A writer whose data descriptor is {@code null}, or is not a {@link PartitionDescriptor}, is
 * skipped with a warning. If no partition descriptor is collected, the state is left untouched;
 * otherwise the collected descriptors are serialized to a JSON list and stored under the key
 * returned by {@code getPartitionsKey(branchId)}.
 * </p>
 */
private void serializePartitionInfoToState() {
  List<PartitionDescriptor> partitionDescriptors = new ArrayList<>();

  for (DataWriter writer : partitionWriters.asMap().values()) {
    Descriptor candidate = writer.getDataDescriptor();
    if (candidate instanceof PartitionDescriptor) {
      partitionDescriptors.add((PartitionDescriptor) candidate);
    } else if (candidate == null) {
      log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", writer.toString());
    } else {
      log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", writer.toString());
    }
  }

  if (partitionDescriptors.isEmpty()) {
    log.info("Partitions info not available. Will not serialize partitions");
    return;
  }

  state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionDescriptors));
}
use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.
the class LineageInfo method putDestination.
/**
 * Put data {@link Descriptor}s of a destination dataset to a state
 *
 * <p>
 * Each descriptor is passed through the resolver first; descriptors that resolve to {@code null}
 * are dropped. The remaining ones are appended to the destinations already stored in the state
 * for the branch, and the combined list is written back as JSON. If the state has no lineage
 * info, a warning is logged and the state is left unchanged.
 * </p>
 *
 * @param descriptors It can be a single item list which just has the dataset descriptor or a list
 *                    of dataset partition descriptors
 * @param branchId the branch whose destination key is updated
 * @param state the state that receives the serialized destinations
 */
public void putDestination(List<Descriptor> descriptors, int branchId, State state) {
  if (!hasLineageInfo(state)) {
    log.warn("State has no lineage info but branch {} puts {} descriptors", branchId, descriptors.size());
    return;
  }

  // Serializing the descriptors to JSON is only worth paying for when the message is emitted.
  if (log.isDebugEnabled()) {
    log.debug("Put destination {} for branch {}", Descriptor.toJson(descriptors), branchId);
  }

  // NOTE(review): the monitor is the String value of the name property rather than the state
  // itself - confirm this provides the mutual exclusion intended for the read-modify-write below.
  synchronized (state.getProp(getKey(NAME_KEY))) {
    List<Descriptor> resolvedDescriptors = new ArrayList<>();
    for (Descriptor descriptor : descriptors) {
      Descriptor resolvedDescriptor = resolver.resolve(descriptor, state);
      if (resolvedDescriptor != null) {
        resolvedDescriptors.add(resolvedDescriptor);
      }
    }

    // Start from the destinations already recorded for this branch, if any, then append.
    String destinationKey = getDestinationKey(branchId);
    String currentDestinations = state.getProp(destinationKey);
    List<Descriptor> allDescriptors = Lists.newArrayList();
    if (StringUtils.isNotEmpty(currentDestinations)) {
      allDescriptors = Descriptor.fromJsonList(currentDestinations);
    }
    allDescriptors.addAll(resolvedDescriptors);

    state.setProp(destinationKey, Descriptor.toJson(allDescriptors));
  }
}
use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.
the class LineageInfo method setSource.
/**
 * Record the source {@link Descriptor} of a lineage event in a state
 *
 * <p>
 * Only the {@link org.apache.gobblin.source.Source} or its {@link org.apache.gobblin.source.extractor.Extractor}
 * is supposed to set the source for a work unit of a dataset
 * </p>
 *
 * <p>
 * The given descriptor is resolved first. If resolution yields {@code null}, the state is left
 * unchanged; otherwise the name of the resolved descriptor and its JSON form are stored as
 * state properties.
 * </p>
 *
 * @param source the descriptor of the source, before resolution
 * @param state state about a {@link org.apache.gobblin.source.workunit.WorkUnit}
 */
public void setSource(Descriptor source, State state) {
  Descriptor resolvedSource = resolver.resolve(source, state);
  if (resolvedSource != null) {
    String nameKey = getKey(NAME_KEY);
    state.setProp(nameKey, resolvedSource.getName());

    String sourceKey = getKey(LineageEventBuilder.SOURCE);
    state.setProp(sourceKey, Descriptor.toJson(resolvedSource));
  }
}
Aggregations