Search in sources :

Example 6 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class BFSPathFinder method findPathUnicast.

/**
 * Breadth-first search over {@link FlowEdge}s for a route from the source node to {@code destNode}.
 * The search frontier is seeded with the edges returned by {@code getNextEdges} for the source node and is
 * expanded one edge at a time in FIFO order. Per the original contract, {@link FlowEdge}s whose output
 * {@link DatasetDescriptor} matches the destDatasetDescriptor are enqueued first, so that dataset
 * transformations happen as close to the source as possible (that ordering is presumably supplied by
 * {@code getNextEdges}; it is not enforced in this method).
 *
 * @param destNode the node the path must end at.
 * @return a path of {@link FlowEdgeContext}s starting at the srcNode and ending at the destNode; an empty list
 * if the source already satisfies the destination; {@code null} if either node is inactive or no path exists.
 */
public List<FlowEdgeContext> findPathUnicast(DataNode destNode) {
    // Start every search with a fresh edge -> predecessor-edge map.
    this.pathMap = new HashMap<>();

    // Guard 1: nothing to compute unless both end points are active.
    boolean endpointsActive = srcNode.isActive() && destNode.isActive();
    if (!endpointsActive) {
        log.warn("Either source node {} or destination node {} is inactive; skipping path computation.", this.srcNode.getId(), destNode.getId());
        return null;
    }

    // Guard 2: source and destination coincide and the dataset already fits -> zero-hop path.
    if (srcNode.equals(destNode)) {
        if (destDatasetDescriptor.contains(srcDatasetDescriptor)) {
            return new ArrayList<>(0);
        }
    }

    // Seed the frontier with the edges leaving the source. Each seed edge is recorded as its own
    // predecessor, which marks it as a starting point of the search.
    List<FlowEdgeContext> seedEdges = getNextEdges(srcNode, srcDatasetDescriptor, destDatasetDescriptor);
    LinkedList<FlowEdgeContext> frontier = new LinkedList<>();
    for (FlowEdgeContext seedEdge : seedEdges) {
        frontier.addLast(seedEdge);
        this.pathMap.put(seedEdge, seedEdge);
    }

    while (!frontier.isEmpty()) {
        FlowEdgeContext visitedEdge = frontier.removeFirst();
        DataNode reachedNode = this.flowGraph.getNode(visitedEdge.getEdge().getDest());
        DatasetDescriptor producedDescriptor = visitedEdge.getOutputDatasetDescriptor();

        // Stop as soon as the edge just taken lands on the destination with an acceptable dataset.
        if (isPathFound(reachedNode, destNode, producedDescriptor, destDatasetDescriptor)) {
            return constructPath(visitedEdge);
        }

        // Otherwise fan out from the reached node, skipping edges that were already discovered.
        for (FlowEdgeContext candidateEdge : getNextEdges(reachedNode, producedDescriptor, destDatasetDescriptor)) {
            if (this.pathMap.containsKey(candidateEdge)) {
                continue;
            }
            this.pathMap.put(candidateEdge, visitedEdge);
            frontier.addLast(candidateEdge);
        }
    }

    // Frontier exhausted without reaching the destination.
    return null;
}
Also used : FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList)

Example 7 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class MultiHopFlowCompilerTest method testGitFlowGraphMonitorService.

/**
 * Verifies that a node file pushed to the remote Git repository ends up as a {@link DataNode} in the flow graph
 * of a {@link MultiHopFlowCompiler} configured with the Git flow graph monitor settings.
 *
 * Instead of sleeping for a fixed interval and then dereferencing the node unconditionally, the test polls the
 * flow graph until the node appears (bounded by a timeout) and asserts it is non-null, so a missed poll shows up
 * as a descriptive assertion failure rather than a {@link NullPointerException}.
 */
@Test(dependsOnMethods = "testMissingDestinationNodeError")
public void testGitFlowGraphMonitorService() throws IOException, GitAPIException, URISyntaxException, InterruptedException {
    File remoteDir = new File(TESTDIR + "/remote");
    File cloneDir = new File(TESTDIR + "/clone");
    File flowGraphDir = new File(cloneDir, "/gobblin-flowgraph");
    // Clean up
    cleanUpDir(TESTDIR);
    // Create a bare repository
    RepositoryCache.FileKey fileKey = RepositoryCache.FileKey.exact(remoteDir, FS.DETECTED);
    Repository remoteRepo = fileKey.open(false);
    remoteRepo.create(true);
    Git gitForPush = Git.cloneRepository().setURI(remoteRepo.getDirectory().getAbsolutePath()).setDirectory(cloneDir).call();
    // push an empty commit as a base for detecting changes
    gitForPush.commit().setMessage("First commit").call();
    RefSpec masterRefSpec = new RefSpec("master");
    gitForPush.push().setRemote("origin").setRefSpecs(masterRefSpec).call();
    URI flowTemplateCatalogUri = this.getClass().getClassLoader().getResource("template_catalog").toURI();
    String monitorPrefix = GitFlowGraphMonitor.GIT_FLOWGRAPH_MONITOR_PREFIX + ".";
    Config config = ConfigBuilder.create()
        .addPrimitive(monitorPrefix + ConfigurationKeys.GIT_MONITOR_REPO_URI, remoteRepo.getDirectory().getAbsolutePath())
        .addPrimitive(monitorPrefix + ConfigurationKeys.GIT_MONITOR_REPO_DIR, TESTDIR + "/git-flowgraph")
        .addPrimitive(monitorPrefix + ConfigurationKeys.GIT_MONITOR_POLLING_INTERVAL, 5)
        .addPrimitive(ServiceConfigKeys.TEMPLATE_CATALOGS_FULLY_QUALIFIED_PATH_KEY, flowTemplateCatalogUri.toString())
        .build();
    // Create a MultiHopFlowCompiler instance
    specCompiler = new MultiHopFlowCompiler(config, Optional.absent(), false);
    specCompiler.setActive(true);
    // Ensure node1 is not present in the graph
    Assert.assertNull(specCompiler.getFlowGraph().getNode("node1"));
    // push a new node file
    File nodeDir = new File(flowGraphDir, "node1");
    File nodeFile = new File(nodeDir, "node1.properties");
    nodeDir.mkdirs();
    nodeFile.createNewFile();
    Files.write(FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam1=val1" + "\n", nodeFile, Charsets.UTF_8);
    // add, commit, push node
    gitForPush.add().addFilepattern(formNodeFilePath(flowGraphDir, nodeDir.getName(), nodeFile.getName())).call();
    gitForPush.commit().setMessage("Node commit").call();
    gitForPush.push().setRemote("origin").setRefSpecs(masterRefSpec).call();
    // The monitor is configured above with a polling interval of 5 (seconds, per the original test's comment).
    // Poll the flow graph until the node shows up, giving the monitor several polling cycles before giving up.
    long deadlineNanos = System.nanoTime() + TimeUnit.SECONDS.toNanos(30);
    DataNode dataNode = specCompiler.getFlowGraph().getNode("node1");
    while (dataNode == null && System.nanoTime() - deadlineNanos < 0) {
        TimeUnit.MILLISECONDS.sleep(500);
        dataNode = specCompiler.getFlowGraph().getNode("node1");
    }
    // Test that a DataNode is added to FlowGraph
    Assert.assertNotNull(dataNode, "node1 was not added to the FlowGraph within the timeout");
    Assert.assertEquals(dataNode.getId(), "node1");
    Assert.assertEquals(dataNode.getRawConfig().getString("param1"), "val1");
}
Also used : Repository(org.eclipse.jgit.lib.Repository) Git(org.eclipse.jgit.api.Git) RefSpec(org.eclipse.jgit.transport.RefSpec) AzkabanProjectConfig(org.apache.gobblin.service.modules.orchestration.AzkabanProjectConfig) Config(com.typesafe.config.Config) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) RepositoryCache(org.eclipse.jgit.lib.RepositoryCache) File(java.io.File) URI(java.net.URI) Test(org.testng.annotations.Test)

Example 8 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class GitFlowGraphMonitorTest method testAddNode.

/**
 * Adds two node files (node1 and node2), lets the monitor process the Git changes, and verifies that BOTH
 * nodes are present in the flow graph, active, and carry their expected parameter.
 */
@Test
public void testAddNode() throws IOException, GitAPIException {
    String file1Contents = FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam1=value1\n";
    String file2Contents = FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam2=value2\n";
    addNode(this.node1Dir, this.node1File, file1Contents);
    addNode(this.node2Dir, this.node2File, file2Contents);
    this.gitFlowGraphMonitor.processGitConfigChanges();
    // Two nodes were added above, so both must be verified. The previous bound of "i < 1" only ever
    // checked node1 and silently skipped node2.
    final int addedNodeCount = 2;
    for (int i = 0; i < addedNodeCount; i++) {
        String nodeId = "node" + (i + 1);
        String paramKey = "param" + (i + 1);
        String paramValue = "value" + (i + 1);
        // Check if nodes have been added to the FlowGraph
        DataNode dataNode = this.flowGraph.getNode(nodeId);
        Assert.assertNotNull(dataNode, nodeId + " was not added to the FlowGraph");
        Assert.assertEquals(dataNode.getId(), nodeId);
        Assert.assertTrue(dataNode.isActive());
        Assert.assertEquals(dataNode.getRawConfig().getString(paramKey), paramValue);
    }
}
Also used : DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) Test(org.testng.annotations.Test) MultiHopFlowCompilerTest(org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)

Example 9 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class GitFlowGraphMonitorTest method testChangesReorder.

/**
 * Exercises the monitor with a batch of commits that delete node1 (together with the node1->node2 edge) and
 * then re-add node1, all processed in a single {@code processGitConfigChanges()} call. Afterwards node1 must
 * exist in the flow graph and have no outgoing edges.
 */
@Test(dependsOnMethods = "testRemoveNode")
public void testChangesReorder() throws GitAPIException, IOException, ExecutionException, InterruptedException {
    String activeNode1Config = FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam1=value1\n";
    String activeNode2Config = FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam2=value2\n";
    String edge1Config = buildEdgeFileContents("node1", "node2", "edge1", "value1");

    // Materialize both node files and the edge file in the working tree.
    createNewFile(this.node1Dir, this.node1File, activeNode1Config);
    createNewFile(this.node2Dir, this.node2File, activeNode2Config);
    createNewFile(this.edge1Dir, this.edge1File, edge1Config);

    // Repository-relative patterns for the three files; they are reused by the add/rm calls below.
    String node1Pattern = formNodeFilePath(this.node1Dir.getName(), this.node1File.getName());
    String node2Pattern = formNodeFilePath(this.node2Dir.getName(), this.node2File.getName());
    String edge1Pattern = formEdgeFilePath(this.edge1Dir.getParentFile().getName(), this.edge1Dir.getName(), this.edge1File.getName());

    // First push: the two nodes.
    this.gitForPush.add().addFilepattern(node1Pattern).call();
    this.gitForPush.add().addFilepattern(node2Pattern).call();
    this.gitForPush.commit().setMessage("Add nodes commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();

    // Second push: the edge between them.
    this.gitForPush.add().addFilepattern(edge1Pattern).call();
    this.gitForPush.commit().setMessage("Add nodes and edges commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();

    this.gitFlowGraphMonitor.processGitConfigChanges();

    // Baseline: both nodes and the edge must now be in the graph.
    DataNode firstNode = this.flowGraph.getNode("node1");
    Assert.assertNotNull(firstNode);
    DataNode secondNode = this.flowGraph.getNode("node2");
    Assert.assertNotNull(secondNode);
    testIfEdgeSuccessfullyAdded("node1", "node2", "edge1", "value1");

    // Remove the node1 file and the node1->node2 edge file from disk ...
    this.node1File.delete();
    this.edge1File.delete();

    // ... and push that removal as the first commit of the batch.
    this.gitForPush.rm().addFilepattern(node1Pattern).call();
    this.gitForPush.rm().addFilepattern(edge1Pattern).call();
    this.gitForPush.commit().setMessage("Delete node1 and edge1 commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();

    // Second commit of the batch: bring node1 back (without the edge).
    createNewFile(this.node1Dir, this.node1File, activeNode1Config);
    this.gitForPush.add().addFilepattern(node1Pattern).call();
    this.gitForPush.commit().setMessage("Add node1 commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();

    // Both commits are picked up by a single processing pass.
    this.gitFlowGraphMonitor.processGitConfigChanges();

    // node1 must be present again, and it must not have any edges.
    firstNode = this.flowGraph.getNode("node1");
    Assert.assertNotNull(firstNode);
    Assert.assertEquals(this.flowGraph.getEdges(firstNode).size(), 0);
}
Also used : DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) Test(org.testng.annotations.Test) MultiHopFlowCompilerTest(org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)

Example 10 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class GitFlowGraphMonitorTest method testUpdateNode.

/**
 * Rewrites node1's file with a new value for {@code param1}, processes the Git changes, and verifies that the
 * node in the flow graph reflects the updated value while remaining active.
 */
@Test(dependsOnMethods = "testUpdateEdge")
public void testUpdateNode() throws IOException, GitAPIException, URISyntaxException, ExecutionException, InterruptedException {
    final String nodeId = "node1";

    // Same node, but param1 now maps to value3.
    String updatedContents = FlowGraphConfigurationKeys.DATA_NODE_IS_ACTIVE_KEY + "=true\nparam1=value3\n";
    addNode(this.node1Dir, this.node1File, updatedContents);
    this.gitFlowGraphMonitor.processGitConfigChanges();

    // The graph should now hold the refreshed node definition.
    DataNode updatedNode = this.flowGraph.getNode(nodeId);
    Assert.assertEquals(updatedNode.getId(), nodeId);
    Assert.assertTrue(updatedNode.isActive());
    Assert.assertEquals(updatedNode.getRawConfig().getString("param1"), "value3");
}
Also used : DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) Test(org.testng.annotations.Test) MultiHopFlowCompilerTest(org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)

Aggregations

DataNode (org.apache.gobblin.service.modules.flowgraph.DataNode)12 Config (com.typesafe.config.Config)6 Test (org.testng.annotations.Test)5 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 SpecExecutor (org.apache.gobblin.runtime.api.SpecExecutor)3 FlowEdgeContext (org.apache.gobblin.service.modules.flow.FlowEdgeContext)3 MultiHopFlowCompilerTest (org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)3 Path (org.apache.hadoop.fs.Path)3 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 LinkedList (java.util.LinkedList)2 BaseFlowGraph (org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Joiner (com.google.common.base.Joiner)1 Optional (com.google.common.base.Optional)1 Preconditions (com.google.common.base.Preconditions)1 Throwables (com.google.common.base.Throwables)1 Lists (com.google.common.collect.Lists)1 ServiceManager (com.google.common.util.concurrent.ServiceManager)1