Search in sources :

Example 1 with Descriptor

use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.

the class BaseDataPublisher method addLineageInfo.

/**
 * Reports the destination of a branch to the lineage info, if lineage info is present.
 *
 * <p>
 *   When partition descriptors are available for the branch, one destination descriptor is reported
 *   per partition, each re-targeted at the destination dataset. Otherwise the destination dataset
 *   descriptor itself is reported.
 * </p>
 *
 * @param state the work unit state the lineage is recorded against
 * @param branchId the branch whose destination is being reported
 */
private void addLineageInfo(WorkUnitState state, int branchId) {
    if (!this.lineageInfo.isPresent()) {
        LOG.info("Will not add lineage info");
        return;
    }

    DatasetDescriptor destinationDataset = createDestinationDescriptor(state, branchId);
    List<PartitionDescriptor> partitionInfos = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId);

    // Partition level lineage: one descriptor per partition, attached to the destination dataset
    List<Descriptor> destinations = new ArrayList<>();
    for (PartitionDescriptor partitionInfo : partitionInfos) {
        destinations.add(partitionInfo.copyWithNewDataset(destinationDataset));
    }

    // Dataset level lineage: nothing was added above, so there were no partitions to report
    if (destinations.isEmpty()) {
        destinations.add(destinationDataset);
    }

    this.lineageInfo.get().putDestination(destinations, branchId, state);
}
Also used: DatasetDescriptor (org.apache.gobblin.dataset.DatasetDescriptor), ArrayList (java.util.ArrayList), PartitionDescriptor (org.apache.gobblin.dataset.PartitionDescriptor), Descriptor (org.apache.gobblin.dataset.Descriptor)

Example 2 with Descriptor

use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.

the class LineageInfo method load.

/**
 * Load all lineage info from a {@link State}
 *
 * <p>
 *   Every property whose key starts with the branch prefix must have the form
 *   {@code <prefix><branchId>.<type>}. The only supported type is
 *   {@link LineageEventBuilder#DESTINATION}, whose value is a JSON list of destination
 *   {@link Descriptor}s; one event is built per destination descriptor.
 * </p>
 *
 * @param state the state holding the lineage properties
 * @return A map from branch to its lineage info. If there is no source info or no destination info,
 *         return an empty map
 * @throws RuntimeException if a branched lineage key is malformed or its type is not supported
 */
static Map<String, Set<LineageEventBuilder>> load(State state) {
    String name = state.getProp(getKey(NAME_KEY));
    Descriptor source = Descriptor.fromJson(state.getProp(getKey(LineageEventBuilder.SOURCE)));
    String branchedPrefix = getKey(BRANCH, "");
    Map<String, Set<LineageEventBuilder>> events = Maps.newHashMap();
    if (source == null) {
        return events;
    }
    for (Map.Entry<Object, Object> entry : state.getProperties().entrySet()) {
        String key = entry.getKey().toString();
        if (!key.startsWith(branchedPrefix)) {
            continue;
        }
        String[] parts = key.substring(branchedPrefix.length()).split("\\.");
        // Checked explicitly rather than with `assert`: assertions are off by default at runtime,
        // which would turn a malformed key into an ArrayIndexOutOfBoundsException or let it pass silently.
        if (parts.length != 2) {
            throw new RuntimeException("Malformed lineage key: " + key);
        }
        String branchId = parts[0];
        String type = parts[1];
        if (!LineageEventBuilder.DESTINATION.equals(type)) {
            throw new RuntimeException("Unsupported lineage key: " + key);
        }
        Set<LineageEventBuilder> branchEvents = events.computeIfAbsent(branchId, k -> new HashSet<>());
        List<Descriptor> descriptors = Descriptor.fromJsonList(entry.getValue().toString());
        for (Descriptor descriptor : descriptors) {
            LineageEventBuilder event = new LineageEventBuilder(name);
            event.setSource(source);
            event.setDestination(descriptor);
            branchEvents.add(event);
        }
    }
    return events;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) Descriptor(org.apache.gobblin.dataset.Descriptor) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 3 with Descriptor

use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.

the class PartitionedDataWriter method serializePartitionInfoToState.

/**
 * Stores the partition descriptors reported by the partition writers in {@link #state}.
 *
 * <p>
 *   A writer whose data descriptor is {@code null} or is not a {@link PartitionDescriptor} is skipped
 *   with a warning. Nothing is written to the state when no partition descriptor was collected.
 * </p>
 */
private void serializePartitionInfoToState() {
    List<PartitionDescriptor> partitionInfos = new ArrayList<>();
    for (DataWriter partitionWriter : partitionWriters.asMap().values()) {
        Descriptor reported = partitionWriter.getDataDescriptor();
        if (reported instanceof PartitionDescriptor) {
            partitionInfos.add((PartitionDescriptor) reported);
        } else if (reported == null) {
            log.warn("Drop partition info as writer {} returns a null PartitionDescriptor", partitionWriter.toString());
        } else {
            log.warn("Drop partition info as writer {} does not return a PartitionDescriptor", partitionWriter.toString());
        }
    }

    if (partitionInfos.isEmpty()) {
        log.info("Partitions info not available. Will not serialize partitions");
        return;
    }
    state.setProp(getPartitionsKey(branchId), PartitionDescriptor.toPartitionJsonList(partitionInfos));
}
Also used: ArrayList (java.util.ArrayList), PartitionDescriptor (org.apache.gobblin.dataset.PartitionDescriptor), Descriptor (org.apache.gobblin.dataset.Descriptor)

Example 4 with Descriptor

use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.

the class LineageInfo method putDestination.

/**
 * Put data {@link Descriptor}s of a destination dataset to a state
 *
 * <p>
 *   Each descriptor is passed through the resolver; descriptors that resolve to {@code null} are
 *   dropped. The remaining descriptors are appended to whatever destinations are already stored
 *   for the branch, and the combined list is written back to the state as JSON.
 * </p>
 *
 * @param descriptors It can be a single item list which just has the dataset descriptor or a list
 *                    of dataset partition descriptors
 * @param branchId the branch the destinations belong to
 * @param state the state to update; left untouched (with a warning) if it has no lineage info
 */
public void putDestination(List<Descriptor> descriptors, int branchId, State state) {
    if (!hasLineageInfo(state)) {
        log.warn("State has no lineage info but branch {} puts {} descriptors", branchId, descriptors.size());
        return;
    }
    // Guarded so the descriptors are only serialized to JSON when debug logging is actually on
    if (log.isDebugEnabled()) {
        log.debug("Put destination {} for branch {}", Descriptor.toJson(descriptors), branchId);
    }
    // NOTE(review): the monitor is the String value of the lineage name property - confirm this
    // gives the intended mutual exclusion between writers of the same state.
    synchronized (state.getProp(getKey(NAME_KEY))) {
        List<Descriptor> resolvedDescriptors = new ArrayList<>();
        for (Descriptor descriptor : descriptors) {
            Descriptor resolvedDescriptor = resolver.resolve(descriptor, state);
            if (resolvedDescriptor == null) {
                continue;
            }
            resolvedDescriptors.add(resolvedDescriptor);
        }
        String destinationKey = getDestinationKey(branchId);
        String currentDestinations = state.getProp(destinationKey);
        List<Descriptor> allDescriptors = Lists.newArrayList();
        if (StringUtils.isNotEmpty(currentDestinations)) {
            // Start from the destinations already recorded for this branch
            allDescriptors = Descriptor.fromJsonList(currentDestinations);
        }
        allDescriptors.addAll(resolvedDescriptors);
        state.setProp(destinationKey, Descriptor.toJson(allDescriptors));
    }
}
Also used : ArrayList(java.util.ArrayList) Descriptor(org.apache.gobblin.dataset.Descriptor) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor)

Example 5 with Descriptor

use of org.apache.gobblin.dataset.Descriptor in project incubator-gobblin by apache.

the class LineageInfo method setSource.

/**
 * Records the source {@link Descriptor} of a lineage event in a state
 *
 * <p>
 *   Only the {@link org.apache.gobblin.source.Source} or its {@link org.apache.gobblin.source.extractor.Extractor}
 *   is supposed to set the source for a work unit of a dataset
 * </p>
 *
 * <p>
 *   The source is resolved first; if it resolves to {@code null} the state is left unchanged.
 *   Otherwise the resolved descriptor's name and its JSON form are stored in the state.
 * </p>
 *
 * @param source descriptor of the source dataset
 * @param state state about a {@link org.apache.gobblin.source.workunit.WorkUnit}
 */
public void setSource(Descriptor source, State state) {
    Descriptor resolvedSource = resolver.resolve(source, state);
    if (resolvedSource != null) {
        String lineageName = resolvedSource.getName();
        state.setProp(getKey(NAME_KEY), lineageName);
        state.setProp(getKey(LineageEventBuilder.SOURCE), Descriptor.toJson(resolvedSource));
    }
}
Also used : Descriptor(org.apache.gobblin.dataset.Descriptor) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor)

Aggregations

Descriptor (org.apache.gobblin.dataset.Descriptor): 5, DatasetDescriptor (org.apache.gobblin.dataset.DatasetDescriptor): 4, ArrayList (java.util.ArrayList): 3, PartitionDescriptor (org.apache.gobblin.dataset.PartitionDescriptor): 2, ImmutableMap (com.google.common.collect.ImmutableMap): 1, HashSet (java.util.HashSet): 1, Map (java.util.Map): 1, Set (java.util.Set): 1