Search in sources :

Example 1 with FlowEdge

use of org.apache.gobblin.service.modules.flowgraph.FlowEdge in project incubator-gobblin by apache.

the class GitFlowGraphMonitor method addFlowEdge.

/**
 * Add a {@link FlowEdge} to the {@link FlowGraph}. The {@link FlowEdgeFactory} class named in the edge config
 * file (falling back to {@link FlowGraphConfigurationKeys#DEFAULT_FLOW_EDGE_FACTORY_CLASS}) is instantiated
 * reflectively and used to build the {@link FlowEdge} from that config.
 * <p>
 * Nothing is added when the changed path does not pass {@code checkFilePath} for {@code EDGE_FILE_DEPTH}, when
 * the flow template catalog is absent, or when any step throws. Failures are logged as warnings and are never
 * propagated to the caller.
 *
 * @param change the {@link DiffEntry} whose new path, resolved against the repository directory, is the edge
 *               config file to load.
 */
private void addFlowEdge(DiffEntry change) {
    if (!checkFilePath(change.getNewPath(), EDGE_FILE_DEPTH)) {
        return;
    }
    Path edgeFilePath = new Path(this.repositoryDir, change.getNewPath());
    try {
        Config edgeConfig = loadEdgeFileWithOverrides(edgeFilePath);
        List<SpecExecutor> specExecutors = getSpecExecutors(edgeConfig);
        Class flowEdgeFactoryClass = Class.forName(ConfigUtils.getString(edgeConfig, FlowGraphConfigurationKeys.FLOW_EDGE_FACTORY_CLASS, FlowGraphConfigurationKeys.DEFAULT_FLOW_EDGE_FACTORY_CLASS));
        FlowEdgeFactory flowEdgeFactory = (FlowEdgeFactory) GobblinConstructorUtils.invokeLongestConstructor(flowEdgeFactoryClass, edgeConfig);
        if (!flowTemplateCatalog.isPresent()) {
            log.warn("Could not add edge defined in {} to FlowGraph as FlowTemplateCatalog is absent", change.getNewPath());
            return;
        }
        FlowEdge edge = flowEdgeFactory.createFlowEdge(edgeConfig, flowTemplateCatalog.get(), specExecutors);
        if (!this.flowGraph.addFlowEdge(edge)) {
            log.warn("Could not add edge {} to FlowGraph; skipping", edge.getId());
        } else {
            log.info("Added edge {} to FlowGraph", edge.getId());
        }
    } catch (Exception e) {
        // Best-effort: a bad edge file must not abort processing of the remaining changes.
        // The exception is passed as the trailing argument so the logger records its type and stack trace;
        // e.getMessage() alone may be null and hides where the failure happened.
        log.warn("Could not add edge defined in {} due to exception {}", change.getNewPath(), e.getMessage(), e);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) FlowEdgeFactory(org.apache.gobblin.service.modules.flowgraph.FlowEdgeFactory) Config(com.typesafe.config.Config) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) URISyntaxException(java.net.URISyntaxException) GitAPIException(org.eclipse.jgit.api.errors.GitAPIException) IOException(java.io.IOException)

Example 2 with FlowEdge

use of org.apache.gobblin.service.modules.flowgraph.FlowEdge in project incubator-gobblin by apache.

the class MultiHopFlowCompilerTest method setUp.

/**
 * Builds the fixtures shared by the tests in this class: a {@link BaseFlowGraph} populated with the data nodes
 * and flow edges found under the {@code flowgraph/datanodes} and {@code flowgraph/flowedges} classpath
 * resources, and a {@link MultiHopFlowCompiler} created over that graph.
 */
@BeforeClass
public void setUp() throws URISyntaxException, IOException, ReflectiveOperationException, FlowEdgeFactory.FlowEdgeCreationException {
    // Node and edge files are both parsed as .properties files that must exist.
    ConfigParseOptions parseOptions = ConfigParseOptions.defaults().setSyntax(ConfigSyntax.PROPERTIES).setAllowMissing(false);

    // Start from an empty graph.
    this.flowGraph = new BaseFlowGraph();

    // One DataNode per file in the data-node resource directory.
    URI nodesDirUri = MultiHopFlowCompilerTest.class.getClassLoader().getResource("flowgraph/datanodes").toURI();
    FileSystem nodesFs = FileSystem.get(nodesDirUri, new Configuration());
    for (FileStatus nodeFile : nodesFs.listStatus(new Path(nodesDirUri))) {
        try (InputStream in = nodesFs.open(nodeFile.getPath())) {
            Config nodeConfig = ConfigFactory.parseReader(new InputStreamReader(in, Charsets.UTF_8), parseOptions);
            String nodeClassName = ConfigUtils.getString(nodeConfig, FlowGraphConfigurationKeys.DATA_NODE_CLASS, FlowGraphConfigurationKeys.DEFAULT_DATA_NODE_CLASS);
            Class dataNodeClass = Class.forName(nodeClassName);
            DataNode dataNode = (DataNode) GobblinConstructorUtils.invokeLongestConstructor(dataNodeClass, nodeConfig);
            this.flowGraph.addDataNode(dataNode);
        }
    }

    // Topology specs provide the SpecExecutors that the edge files refer to by URI.
    URI topologyCatalogUri = this.getClass().getClassLoader().getResource("topologyspec_catalog").toURI();
    Map<URI, TopologySpec> topologySpecsByUri = buildTopologySpecMap(topologyCatalogUri);

    // Flow template catalog configured with the "template_catalog" resource URI under both path keys.
    URI templateCatalogUri = this.getClass().getClassLoader().getResource("template_catalog").toURI();
    Properties catalogProps = new Properties();
    catalogProps.put(ServiceConfigKeys.TEMPLATE_CATALOGS_FULLY_QUALIFIED_PATH_KEY, templateCatalogUri.toString());
    Config config = ConfigFactory.parseProperties(catalogProps);
    Config templateCatalogCfg = config.withValue(ConfigurationKeys.JOB_CONFIG_FILE_GENERAL_PATH_KEY, config.getValue(ServiceConfigKeys.TEMPLATE_CATALOGS_FULLY_QUALIFIED_PATH_KEY));
    FSFlowTemplateCatalog flowCatalog = new FSFlowTemplateCatalog(templateCatalogCfg);

    // One FlowEdge per file in the flow-edge resource directory.
    URI edgesDirUri = MultiHopFlowCompilerTest.class.getClassLoader().getResource("flowgraph/flowedges").toURI();
    FileSystem edgesFs = FileSystem.get(edgesDirUri, new Configuration());
    for (FileStatus edgeFile : edgesFs.listStatus(new Path(edgesDirUri))) {
        log.warn(edgeFile.getPath().toString());
        try (InputStream in = edgesFs.open(edgeFile.getPath())) {
            Config flowEdgeConfig = ConfigFactory.parseReader(new InputStreamReader(in, Charsets.UTF_8), parseOptions);
            String factoryClassName = ConfigUtils.getString(flowEdgeConfig, FlowGraphConfigurationKeys.FLOW_EDGE_FACTORY_CLASS, FlowGraphConfigurationKeys.DEFAULT_FLOW_EDGE_FACTORY_CLASS);
            Class flowEdgeFactoryClass = Class.forName(factoryClassName);
            // NOTE(review): the factory is constructed with the catalog-level config rather than the per-edge
            // config - confirm this is intentional.
            FlowEdgeFactory flowEdgeFactory = (FlowEdgeFactory) GobblinConstructorUtils.invokeLongestConstructor(flowEdgeFactoryClass, config);
            // Resolve every executor URI listed in the edge file against the topology spec map.
            List<SpecExecutor> specExecutors = new ArrayList<>();
            for (String specExecutorName : ConfigUtils.getStringList(flowEdgeConfig, FlowGraphConfigurationKeys.FLOW_EDGE_SPEC_EXECUTORS_KEY)) {
                specExecutors.add(topologySpecsByUri.get(new URI(specExecutorName)).getSpecExecutor());
            }
            FlowEdge edge = flowEdgeFactory.createFlowEdge(flowEdgeConfig, flowCatalog, specExecutors);
            this.flowGraph.addFlowEdge(edge);
        }
    }

    this.specCompiler = new MultiHopFlowCompiler(config, this.flowGraph);
}
Also used : Path(org.apache.hadoop.fs.Path) FSFlowTemplateCatalog(org.apache.gobblin.service.modules.template_catalog.FSFlowTemplateCatalog) FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) AzkabanProjectConfig(org.apache.gobblin.service.modules.orchestration.AzkabanProjectConfig) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) BaseFlowGraph(org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph) Properties(java.util.Properties) URI(java.net.URI) TopologySpec(org.apache.gobblin.runtime.api.TopologySpec) FlowEdgeFactory(org.apache.gobblin.service.modules.flowgraph.FlowEdgeFactory) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) FileSystem(org.apache.hadoop.fs.FileSystem) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) AbstractSpecExecutor(org.apache.gobblin.runtime.spec_executorInstance.AbstractSpecExecutor) BeforeClass(org.testng.annotations.BeforeClass) AfterClass(org.testng.annotations.AfterClass) ConfigParseOptions(com.typesafe.config.ConfigParseOptions) BeforeClass(org.testng.annotations.BeforeClass)

Example 3 with FlowEdge

use of org.apache.gobblin.service.modules.flowgraph.FlowEdge in project incubator-gobblin by apache.

the class GitFlowGraphMonitorTest method testIfEdgeSuccessfullyAdded.

/**
 * Asserts that exactly one edge is registered for {@code node1} in the flow graph and that its id, source,
 * destination, first two spec executors and {@code key1} config value match the expectations.
 *
 * @param node1 expected source of the edge; also the node whose edges are looked up.
 * @param node2 expected destination of the edge.
 * @param edgeName last component of the expected edge id ({@code node1_node2_edgeName}).
 * @param value expected value of the {@code key1} entry in the edge config.
 */
private void testIfEdgeSuccessfullyAdded(String node1, String node2, String edgeName, String value) throws ExecutionException, InterruptedException {
    Set<FlowEdge> outgoingEdges = this.flowGraph.getEdges(node1);
    Assert.assertEquals(outgoingEdges.size(), 1);
    FlowEdge edge = outgoingEdges.iterator().next();

    String expectedEdgeId = Joiner.on("_").join(node1, node2, edgeName);
    Assert.assertEquals(edge.getId(), expectedEdgeId);
    Assert.assertEquals(edge.getSrc(), node1);
    Assert.assertEquals(edge.getDest(), node2);

    // Expected settings of executor 0 and executor 1, in that order.
    String[] expectedSpecStoreDirs = {"/tmp1", "/tmp2"};
    String[] expectedCapabilities = {"s1:d1", "s2:d2"};
    for (int i = 0; i < expectedSpecStoreDirs.length; i++) {
        Assert.assertEquals(edge.getExecutors().get(i).getConfig().get().getString("specStore.fs.dir"), expectedSpecStoreDirs[i]);
        Assert.assertEquals(edge.getExecutors().get(i).getConfig().get().getString("specExecInstance.capabilities"), expectedCapabilities[i]);
        Assert.assertEquals(edge.getExecutors().get(i).getClass().getSimpleName(), "InMemorySpecExecutor");
    }

    Assert.assertEquals(edge.getConfig().getString("key1"), value);
}
Also used : FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge)

Example 4 with FlowEdge

use of org.apache.gobblin.service.modules.flowgraph.FlowEdge in project incubator-gobblin by apache.

the class GitFlowGraphMonitorTest method testRemoveEdge.

/**
 * Deletes the edge1 config file, pushes the removal through git, lets the monitor process the change and
 * checks that "node1" no longer has any edge in the flow graph.
 */
@Test(dependsOnMethods = "testUpdateNode")
public void testRemoveEdge() throws GitAPIException, IOException {
    // Delete the config file from the working tree; the boolean result is deliberately not checked here.
    edge1File.delete();
    // Node1 has 1 edge before delete
    Set<FlowEdge> edgeSet = this.flowGraph.getEdges("node1");
    Assert.assertEquals(edgeSet.size(), 1);
    // delete, commit, push (the DirCache / RevCommit results are not needed by this test)
    String edgeFilePattern = formEdgeFilePath(this.edge1Dir.getParentFile().getName(), this.edge1Dir.getName(), this.edge1File.getName());
    this.gitForPush.rm().addFilepattern(edgeFilePattern).call();
    this.gitForPush.commit().setMessage("Edge remove commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();
    this.gitFlowGraphMonitor.processGitConfigChanges();
    // Check if edge1 has been deleted from the graph; assertEquals reports the actual size on failure.
    edgeSet = this.flowGraph.getEdges("node1");
    Assert.assertEquals(edgeSet.size(), 0);
}
Also used : FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) DirCache(org.eclipse.jgit.dircache.DirCache) RevCommit(org.eclipse.jgit.revwalk.RevCommit) Test(org.testng.annotations.Test) MultiHopFlowCompilerTest(org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)

Example 5 with FlowEdge

use of org.apache.gobblin.service.modules.flowgraph.FlowEdge in project incubator-gobblin by apache.

the class AbstractPathFinder method getNextEdges.

/**
 * A helper method that collects the {@link FlowEdge}s incident on {@code dataNode} and orders them by whether
 * the format config of an edge's output {@link DatasetDescriptor} is contained in that of
 * {@code destDatasetDescriptor}: such edges are placed at the front of the returned list, all others at the back.
 * <p>
 * An edge is skipped when it is not in the non-empty whitelist read from the flow config, when it or its
 * destination node is inactive, or when evaluating it throws one of the exceptions caught below (logged as a
 * warning). Descriptor pairs whose template fails to resolve are recorded as compilation errors on the flow spec.
 *
 * @param dataNode the {@link DataNode} to be expanded for determining candidate edges.
 * @param currentDatasetDescriptor Output {@link DatasetDescriptor} of the current edge.
 * @param destDatasetDescriptor Target {@link DatasetDescriptor}.
 * @return prioritized list of {@link FlowEdgeContext}s to be added to the edge queue for expansion.
 */
List<FlowEdgeContext> getNextEdges(DataNode dataNode, DatasetDescriptor currentDatasetDescriptor, DatasetDescriptor destDatasetDescriptor) {
    List<FlowEdgeContext> prioritizedEdgeList = new LinkedList<>();
    List<String> edgeIds = ConfigUtils.getStringList(this.flowConfig, ConfigurationKeys.WHITELISTED_EDGE_IDS);
    for (FlowEdge flowEdge : this.flowGraph.getEdges(dataNode)) {
        // An empty whitelist places no restriction on the edges.
        if (!edgeIds.isEmpty() && !edgeIds.contains(flowEdge.getId())) {
            continue;
        }
        try {
            DataNode edgeDestination = this.flowGraph.getNode(flowEdge.getDest());
            // Base condition: Skip this FlowEdge, if it is inactive or if the destination of this edge is inactive.
            if (!edgeDestination.isActive() || !flowEdge.isActive()) {
                continue;
            }
            boolean foundExecutor = false;
            // Iterate over all executors for this edge. Find the first one that resolves the underlying flow template.
            for (SpecExecutor specExecutor : flowEdge.getExecutors()) {
                // NOTE(review): neither the merged config nor the descriptor pairs depend on specExecutor, so
                // every iteration repeats the same resolution work - confirm before hoisting out of the loop.
                Config mergedConfig = getMergedConfig(flowEdge);
                List<Pair<DatasetDescriptor, DatasetDescriptor>> datasetDescriptorPairs = flowEdge.getFlowTemplate().getDatasetDescriptors(mergedConfig, false);
                for (Pair<DatasetDescriptor, DatasetDescriptor> datasetDescriptorPair : datasetDescriptorPairs) {
                    DatasetDescriptor inputDatasetDescriptor = datasetDescriptorPair.getLeft();
                    DatasetDescriptor outputDatasetDescriptor = datasetDescriptorPair.getRight();
                    try {
                        flowEdge.getFlowTemplate().tryResolving(mergedConfig, inputDatasetDescriptor, outputDatasetDescriptor);
                    } catch (JobTemplate.TemplateException | ConfigException | SpecNotFoundException e) {
                        // Record why this descriptor pair is unusable and move on to the next pair.
                        flowSpec.addCompilationError(flowEdge.getSrc(), flowEdge.getDest(), "Error compiling edge " + flowEdge.toString() + ": " + e.toString());
                        continue;
                    }
                    if (inputDatasetDescriptor.contains(currentDatasetDescriptor)) {
                        DatasetDescriptor edgeOutputDescriptor = makeOutputDescriptorSpecific(currentDatasetDescriptor, outputDatasetDescriptor);
                        FlowEdgeContext flowEdgeContext = new FlowEdgeContext(flowEdge, currentDatasetDescriptor, edgeOutputDescriptor, mergedConfig, specExecutor);
                        if (destDatasetDescriptor.getFormatConfig().contains(outputDatasetDescriptor.getFormatConfig())) {
                            /*
                            Add to the front of the edge list if platform-independent properties of the output descriptor is compatible
                            with those of destination dataset descriptor.
                            In other words, we prioritize edges that perform data transformations as close to the source as possible.
                            */
                            prioritizedEdgeList.add(0, flowEdgeContext);
                        } else {
                            prioritizedEdgeList.add(flowEdgeContext);
                        }
                        foundExecutor = true;
                    }
                }
                // TODO: Choose the min-cost executor for the FlowEdge as opposed to the first one that resolves.
                if (foundExecutor) {
                    break;
                }
            }
        } catch (IOException | ReflectiveOperationException | SpecNotFoundException | JobTemplate.TemplateException e) {
            // Skip the edge; and continue. The exception is the trailing argument with no placeholder of its
            // own, so the logger reports it with its stack trace instead of leaving a "{}" unfilled.
            log.warn("Skipping edge {} with config {} due to exception", flowEdge.getId(), flowConfig.toString(), e);
        }
    }
    return prioritizedEdgeList;
}
Also used : FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ConfigException(com.typesafe.config.ConfigException) IOException(java.io.IOException) LinkedList(java.util.LinkedList) FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) Pair(org.apache.commons.lang3.tuple.Pair)

Aggregations

FlowEdge (org.apache.gobblin.service.modules.flowgraph.FlowEdge): 5; Config (com.typesafe.config.Config): 3; SpecExecutor (org.apache.gobblin.runtime.api.SpecExecutor): 3; IOException (java.io.IOException): 2; DataNode (org.apache.gobblin.service.modules.flowgraph.DataNode): 2; FlowEdgeFactory (org.apache.gobblin.service.modules.flowgraph.FlowEdgeFactory): 2; Path (org.apache.hadoop.fs.Path): 2; ConfigException (com.typesafe.config.ConfigException): 1; ConfigParseOptions (com.typesafe.config.ConfigParseOptions): 1; InputStream (java.io.InputStream): 1; InputStreamReader (java.io.InputStreamReader): 1; URI (java.net.URI): 1; URISyntaxException (java.net.URISyntaxException): 1; ArrayList (java.util.ArrayList): 1; LinkedList (java.util.LinkedList): 1; Properties (java.util.Properties): 1; Pair (org.apache.commons.lang3.tuple.Pair): 1; SpecNotFoundException (org.apache.gobblin.runtime.api.SpecNotFoundException): 1; TopologySpec (org.apache.gobblin.runtime.api.TopologySpec): 1; AbstractSpecExecutor (org.apache.gobblin.runtime.spec_executorInstance.AbstractSpecExecutor): 1