Examples with DatasetDescriptor - org.apache.gobblin.service.modules.dataset.DatasetDescriptor

Example 1 with DatasetDescriptor

use of org.apache.gobblin.service.modules.dataset.DatasetDescriptor in project incubator-gobblin by apache.

the class BFSPathFinder method findPathUnicast.

/**
 * Breadth-first search for a path from the source node to {@code destNode}. The search runs over
 * {@link FlowEdge}s rather than nodes: the queue holds {@link FlowEdgeContext}s, and {@code pathMap} records,
 * for every enqueued edge, the edge it was reached from (the edges leaving the source node are recorded as
 * their own predecessor). Edges are enqueued in the order in which {@code getNextEdges} returns them.
 * @param destNode the {@link DataNode} at which the path must end.
 * @return an empty list if the source already satisfies the destination; otherwise the result of
 * {@code constructPath} for the first dequeued edge that satisfies {@code isPathFound}; or null if either
 * endpoint is inactive or the queue is exhausted without finding a path.
 */
public List<FlowEdgeContext> findPathUnicast(DataNode destNode) {
    // Every invocation starts from an empty predecessor map.
    this.pathMap = new HashMap<>();

    // Guard 1: both endpoints must be active, otherwise no path is computed.
    boolean endpointsActive = srcNode.isActive() && destNode.isActive();
    if (!endpointsActive) {
        log.warn("Either source node {} or destination node {} is inactive; skipping path computation.", this.srcNode.getId(), destNode.getId());
        return null;
    }

    // Guard 2: source and destination coincide and the destination descriptor already contains the source
    // descriptor, so there is nothing to traverse.
    boolean alreadyAtTarget = srcNode.equals(destNode) && destDatasetDescriptor.contains(srcDatasetDescriptor);
    if (alreadyAtTarget) {
        return new ArrayList<>(0);
    }

    // Seed the frontier with the edges leaving the source; each seed edge is its own predecessor.
    LinkedList<FlowEdgeContext> frontier = new LinkedList<>(getNextEdges(srcNode, srcDatasetDescriptor, destDatasetDescriptor));
    for (FlowEdgeContext seedEdge : frontier) {
        this.pathMap.put(seedEdge, seedEdge);
    }

    while (!frontier.isEmpty()) {
        // Take the oldest edge in the queue (FIFO order).
        FlowEdgeContext visitedEdge = frontier.removeFirst();
        DataNode reachedNode = this.flowGraph.getNode(visitedEdge.getEdge().getDest());
        DatasetDescriptor reachedDescriptor = visitedEdge.getOutputDatasetDescriptor();

        // Stop as soon as the edge just taken lands on the destination with an acceptable descriptor.
        if (isPathFound(reachedNode, destNode, reachedDescriptor, destDatasetDescriptor)) {
            return constructPath(visitedEdge);
        }

        // Otherwise expand the node that was reached and enqueue every edge not seen before.
        for (FlowEdgeContext candidateEdge : getNextEdges(reachedNode, reachedDescriptor, destDatasetDescriptor)) {
            if (this.pathMap.containsKey(candidateEdge)) {
                // Already enqueued earlier; keep the predecessor recorded the first time.
                continue;
            }
            frontier.addLast(candidateEdge);
            this.pathMap.put(candidateEdge, visitedEdge);
        }
    }

    // Queue exhausted without reaching the destination.
    return null;
}

Also used : FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList)

Example 2 with DatasetDescriptor

use of org.apache.gobblin.service.modules.dataset.DatasetDescriptor in project incubator-gobblin by apache.

the class StaticFlowTemplate method getDatasetDescriptors.

/**
 * Builds the list of (input, output) {@link DatasetDescriptor} pairs declared by this {@link FlowTemplate}.
 * Pairs are read from consecutively numbered config entries (0, 1, 2, ...) under the input and output
 * descriptor prefixes; enumeration stops at the first index that has no input entry.
 * @param userConfig User supplied Config
 * @param resolvable when true, a pair is kept only if {@code tryResolving} succeeds for it
 * @return the descriptor pairs in index order. An index whose descriptors cannot be instantiated is skipped.
 * @throws IOException if the resolved config lacks either the input or the output descriptor section.
 */
@Override
public List<Pair<DatasetDescriptor, DatasetDescriptor>> getDatasetDescriptors(Config userConfig, boolean resolvable) throws IOException {
    // Resolve the flow config while tolerating substitutions that cannot be resolved yet.
    Config config = this.getResolvedFlowConfig(userConfig)
        .resolve(ConfigResolveOptions.defaults().setAllowUnresolved(true));

    boolean declaresBothSides = config.hasPath(DatasetDescriptorConfigKeys.FLOW_EDGE_INPUT_DATASET_DESCRIPTOR_PREFIX)
        && config.hasPath(DatasetDescriptorConfigKeys.FLOW_EDGE_OUTPUT_DATASET_DESCRIPTOR_PREFIX);
    if (!declaresBothSides) {
        throw new IOException("Flow template must specify at least one input/output dataset descriptor");
    }

    Joiner dotJoiner = Joiner.on(".");
    List<Pair<DatasetDescriptor, DatasetDescriptor>> descriptorPairs = Lists.newArrayList();
    for (int index = 0; ; index++) {
        String indexSuffix = Integer.toString(index);
        String inputKey = dotJoiner.join(DatasetDescriptorConfigKeys.FLOW_EDGE_INPUT_DATASET_DESCRIPTOR_PREFIX, indexSuffix);
        if (!config.hasPath(inputKey)) {
            // No input descriptor at this index: end of the numbered entries.
            break;
        }
        try {
            DatasetDescriptor inputDescriptor =
                DatasetDescriptorUtils.constructDatasetDescriptor(config.getConfig(inputKey));
            String outputKey = dotJoiner.join(DatasetDescriptorConfigKeys.FLOW_EDGE_OUTPUT_DATASET_DESCRIPTOR_PREFIX, indexSuffix);
            DatasetDescriptor outputDescriptor =
                DatasetDescriptorUtils.constructDatasetDescriptor(config.getConfig(outputKey));
            if (resolvable) {
                try {
                    tryResolving(userConfig, inputDescriptor, outputDescriptor);
                } catch (JobTemplate.TemplateException | ConfigException | SpecNotFoundException e) {
                    // The pair does not resolve against userConfig; leave it out and move to the next index.
                    continue;
                }
            }
            descriptorPairs.add(ImmutablePair.of(inputDescriptor, outputDescriptor));
        } catch (ReflectiveOperationException e) {
            // The descriptor could not be instantiated; skip this index and try the next one.
        }
    }
    return descriptorPairs;
}

Also used : DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) Config(com.typesafe.config.Config) ConfigException(com.typesafe.config.ConfigException) IOException(java.io.IOException) Pair(org.apache.commons.lang3.tuple.Pair) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair)

Example 3 with DatasetDescriptor

use of org.apache.gobblin.service.modules.dataset.DatasetDescriptor in project incubator-gobblin by apache.

the class AbstractPathFinder method getNextEdges.

/**
 * Collects the {@link FlowEdge}s leaving {@code dataNode} that can consume the current dataset, wrapped as
 * {@link FlowEdgeContext}s and ordered so that edges whose output format config is contained in the
 * destination's format config come first.
 * Edges are skipped when they are not in the configured whitelist (if the whitelist is non-empty), when the
 * edge or its destination node is inactive, or when processing the edge throws one of the handled exceptions.
 * For each remaining edge the executors are tried in order, and iteration stops at the first executor for
 * which at least one descriptor pair was accepted.
 * @param dataNode the {@link DataNode} to be expanded for determining candidate edges.
 * @param currentDatasetDescriptor Output {@link DatasetDescriptor} of the current edge.
 * @param destDatasetDescriptor Target {@link DatasetDescriptor}.
 * @return prioritized list of {@link FlowEdgeContext}s to be added to the edge queue for expansion.
 */
List<FlowEdgeContext> getNextEdges(DataNode dataNode, DatasetDescriptor currentDatasetDescriptor, DatasetDescriptor destDatasetDescriptor) {
    List<FlowEdgeContext> candidates = new LinkedList<>();
    List<String> whitelistedEdgeIds = ConfigUtils.getStringList(this.flowConfig, ConfigurationKeys.WHITELISTED_EDGE_IDS);
    for (FlowEdge flowEdge : this.flowGraph.getEdges(dataNode)) {
        // An empty whitelist admits every edge; a non-empty one admits only the listed edge ids.
        boolean whitelisted = whitelistedEdgeIds.isEmpty() || whitelistedEdgeIds.contains(flowEdge.getId());
        if (!whitelisted) {
            continue;
        }
        try {
            DataNode targetNode = this.flowGraph.getNode(flowEdge.getDest());
            // Both the edge and the node it leads to must be active.
            if (!(targetNode.isActive() && flowEdge.isActive())) {
                continue;
            }
            boolean executorChosen = false;
            // Try the executors in order; the first one that yields a usable descriptor pair wins.
            for (SpecExecutor executor : flowEdge.getExecutors()) {
                Config edgeConfig = getMergedConfig(flowEdge);
                List<Pair<DatasetDescriptor, DatasetDescriptor>> descriptorPairs = flowEdge.getFlowTemplate().getDatasetDescriptors(edgeConfig, false);
                for (Pair<DatasetDescriptor, DatasetDescriptor> descriptorPair : descriptorPairs) {
                    DatasetDescriptor edgeInput = descriptorPair.getLeft();
                    DatasetDescriptor edgeOutput = descriptorPair.getRight();
                    try {
                        flowEdge.getFlowTemplate().tryResolving(edgeConfig, edgeInput, edgeOutput);
                    } catch (JobTemplate.TemplateException | ConfigException | SpecNotFoundException e) {
                        // Record why this pair was rejected, then move on to the next pair.
                        flowSpec.addCompilationError(flowEdge.getSrc(), flowEdge.getDest(), "Error compiling edge " + flowEdge.toString() + ": " + e.toString());
                        continue;
                    }
                    // The edge is usable only if its input descriptor contains the current dataset.
                    if (!edgeInput.contains(currentDatasetDescriptor)) {
                        continue;
                    }
                    DatasetDescriptor specificOutput = makeOutputDescriptorSpecific(currentDatasetDescriptor, edgeOutput);
                    FlowEdgeContext edgeContext = new FlowEdgeContext(flowEdge, currentDatasetDescriptor, specificOutput, edgeConfig, executor);
                    boolean formatMatchesDestination = destDatasetDescriptor.getFormatConfig().contains(edgeOutput.getFormatConfig());
                    if (formatMatchesDestination) {
                        // Edges whose output format already fits the destination go to the front of the list,
                        // so that data transformations are attempted as close to the source as possible.
                        candidates.add(0, edgeContext);
                    } else {
                        candidates.add(edgeContext);
                    }
                    executorChosen = true;
                }
                // TODO: Choose the min-cost executor for the FlowEdge as opposed to the first one that resolves.
                if (executorChosen) {
                    break;
                }
            }
        } catch (IOException | ReflectiveOperationException | SpecNotFoundException | JobTemplate.TemplateException e) {
            // Skip the edge; and continue
            log.warn("Skipping edge {} with config {} due to exception: {}", flowEdge.getId(), flowConfig.toString(), e);
        }
    }
    return candidates;
}

Also used : FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ConfigException(com.typesafe.config.ConfigException) IOException(java.io.IOException) LinkedList(java.util.LinkedList) FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) Pair(org.apache.commons.lang3.tuple.Pair)

Example 4 with DatasetDescriptor

use of org.apache.gobblin.service.modules.dataset.DatasetDescriptor in project incubator-gobblin by apache.

the class FlowGraphPath method convertHopToDag.

/**
 * Turns one hop of a flow path into a {@link Dag} of {@link JobExecutionPlan}s. The job configs are obtained
 * from the {@link FlowTemplate} of the hop's {@link FlowEdge}, each is converted to a {@link JobExecutionPlan},
 * job dependencies are updated via {@code updateJobDependencies}, and the plans are assembled into a Dag.
 * @param flowEdgeContext an instance of {@link FlowEdgeContext}.
 * @param sysConfig environment config.
 * @return a {@link Dag} of {@link JobExecutionPlan}s associated with the {@link FlowEdge}.
 */
private Dag<JobExecutionPlan> convertHopToDag(FlowEdgeContext flowEdgeContext, Config sysConfig) throws SpecNotFoundException, JobTemplate.TemplateException, URISyntaxException {
    // Unpack everything needed from the hop context up front.
    FlowTemplate template = flowEdgeContext.getEdge().getFlowTemplate();
    DatasetDescriptor hopInput = flowEdgeContext.getInputDatasetDescriptor();
    DatasetDescriptor hopOutput = flowEdgeContext.getOutputDatasetDescriptor();
    Config hopConfig = flowEdgeContext.getMergedConfig();
    SpecExecutor executor = flowEdgeContext.getSpecExecutor();

    // Ask the flow template for the job configs resolved against this hop's config and descriptors.
    List<Config> jobConfigs = template.getResolvedJobConfigs(hopConfig, hopInput, hopOutput);
    int jobCount = jobConfigs.size();
    List<JobExecutionPlan> plans = new ArrayList<>(jobCount);
    Map<String, String> jobNameByTemplate = Maps.newHashMapWithExpectedSize(jobCount);

    // Create one plan per job config and record the job name under the plan's template name.
    for (Config jobConfig : jobConfigs) {
        JobExecutionPlan plan = new JobExecutionPlan.Factory().createPlan(flowSpec, jobConfig, executor, flowExecutionId, sysConfig);
        plans.add(plan);
        String jobName = plan.getJobSpec().getConfig().getString(ConfigurationKeys.JOB_NAME_KEY);
        jobNameByTemplate.put(getJobTemplateName(plan), jobName);
    }

    updateJobDependencies(plans, jobNameByTemplate);
    return new JobExecutionPlanDagFactory().createDag(plans);
}

Also used : FlowTemplate(org.apache.gobblin.service.modules.template.FlowTemplate) JobExecutionPlan(org.apache.gobblin.service.modules.spec.JobExecutionPlan) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) JobExecutionPlanDagFactory(org.apache.gobblin.service.modules.spec.JobExecutionPlanDagFactory)

Aggregations

DatasetDescriptor (org.apache.gobblin.service.modules.dataset.DatasetDescriptor): 4; Config (com.typesafe.config.Config): 3; ConfigException (com.typesafe.config.ConfigException): 2; IOException (java.io.IOException): 2; ArrayList (java.util.ArrayList): 2; LinkedList (java.util.LinkedList): 2; Pair (org.apache.commons.lang3.tuple.Pair): 2; SpecExecutor (org.apache.gobblin.runtime.api.SpecExecutor): 2; SpecNotFoundException (org.apache.gobblin.runtime.api.SpecNotFoundException): 2; FlowEdgeContext (org.apache.gobblin.service.modules.flow.FlowEdgeContext): 2; DataNode (org.apache.gobblin.service.modules.flowgraph.DataNode): 2; ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair): 1; FlowEdge (org.apache.gobblin.service.modules.flowgraph.FlowEdge): 1; JobExecutionPlan (org.apache.gobblin.service.modules.spec.JobExecutionPlan): 1; JobExecutionPlanDagFactory (org.apache.gobblin.service.modules.spec.JobExecutionPlanDagFactory): 1; FlowTemplate (org.apache.gobblin.service.modules.template.FlowTemplate): 1