Search in sources:

Example 1 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

From the class GitFlowGraphMonitor, method addDataNode.

/**
 * Add a {@link DataNode} to the {@link FlowGraph}. The method uses the {@link FlowGraphConfigurationKeys#DATA_NODE_CLASS} config
 * to instantiate a {@link DataNode} from the node config file.
 * @param change the {@link DiffEntry} describing the added or modified node file in the Git repository
 */
private void addDataNode(DiffEntry change) {
    if (checkFilePath(change.getNewPath(), NODE_FILE_DEPTH)) {
        Path nodeFilePath = new Path(this.repositoryDir, change.getNewPath());
        try {
            Config config = loadNodeFileWithOverrides(nodeFilePath);
            // Resolve the DataNode implementation named in the node config, falling back to the default class.
            Class<?> dataNodeClass = Class.forName(ConfigUtils.getString(config, FlowGraphConfigurationKeys.DATA_NODE_CLASS, FlowGraphConfigurationKeys.DEFAULT_DATA_NODE_CLASS));
            DataNode dataNode = (DataNode) GobblinConstructorUtils.invokeLongestConstructor(dataNodeClass, config);
            if (!this.flowGraph.addDataNode(dataNode)) {
                log.warn("Could not add DataNode {} to FlowGraph; skipping", dataNode.getId());
            } else {
                log.info("Added DataNode {} to FlowGraph", dataNode.getId());
            }
        } catch (Exception e) {
            // Pass the Throwable as the last argument WITHOUT a placeholder so SLF4J logs the full stack trace;
            // a "{}" for the exception would swallow the trace and print only its toString().
            log.warn("Could not add DataNode defined in {} due to exception", change.getNewPath(), e);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Config(com.typesafe.config.Config) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) URISyntaxException(java.net.URISyntaxException) GitAPIException(org.eclipse.jgit.api.errors.GitAPIException) IOException(java.io.IOException)

Example 2 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

From the class MultiHopFlowCompilerTest, method setUp.

/**
 * Builds the test fixture once per class: a {@link BaseFlowGraph} populated with DataNodes and
 * FlowEdges loaded from properties files on the test classpath, plus the
 * {@link MultiHopFlowCompiler} under test wired to that graph.
 */
@BeforeClass
public void setUp() throws URISyntaxException, IOException, ReflectiveOperationException, FlowEdgeFactory.FlowEdgeCreationException {
    // Create a FlowGraph
    this.flowGraph = new BaseFlowGraph();
    // Add DataNodes to the graph from the node properties files
    URI dataNodesUri = MultiHopFlowCompilerTest.class.getClassLoader().getResource("flowgraph/datanodes").toURI();
    FileSystem fs = FileSystem.get(dataNodesUri, new Configuration());
    Path dataNodesPath = new Path(dataNodesUri);
    // Node/edge files are Java-properties syntax; fail loudly if a resource is missing.
    ConfigParseOptions options = ConfigParseOptions.defaults().setSyntax(ConfigSyntax.PROPERTIES).setAllowMissing(false);
    for (FileStatus fileStatus : fs.listStatus(dataNodesPath)) {
        try (InputStream is = fs.open(fileStatus.getPath())) {
            Config nodeConfig = ConfigFactory.parseReader(new InputStreamReader(is, Charsets.UTF_8), options);
            // Each node file may name its own DataNode implementation; otherwise the default class is used.
            Class dataNodeClass = Class.forName(ConfigUtils.getString(nodeConfig, FlowGraphConfigurationKeys.DATA_NODE_CLASS, FlowGraphConfigurationKeys.DEFAULT_DATA_NODE_CLASS));
            DataNode dataNode = (DataNode) GobblinConstructorUtils.invokeLongestConstructor(dataNodeClass, nodeConfig);
            this.flowGraph.addDataNode(dataNode);
        }
    }
    // Map of SpecExecutor URI -> TopologySpec, used to resolve the executors an edge declares.
    URI specExecutorCatalogUri = this.getClass().getClassLoader().getResource("topologyspec_catalog").toURI();
    Map<URI, TopologySpec> topologySpecMap = buildTopologySpecMap(specExecutorCatalogUri);
    // Create a FSFlowTemplateCatalog instance
    URI flowTemplateCatalogUri = this.getClass().getClassLoader().getResource("template_catalog").toURI();
    Properties properties = new Properties();
    properties.put(ServiceConfigKeys.TEMPLATE_CATALOGS_FULLY_QUALIFIED_PATH_KEY, flowTemplateCatalogUri.toString());
    Config config = ConfigFactory.parseProperties(properties);
    Config templateCatalogCfg = config.withValue(ConfigurationKeys.JOB_CONFIG_FILE_GENERAL_PATH_KEY, config.getValue(ServiceConfigKeys.TEMPLATE_CATALOGS_FULLY_QUALIFIED_PATH_KEY));
    FSFlowTemplateCatalog flowCatalog = new FSFlowTemplateCatalog(templateCatalogCfg);
    // Add FlowEdges from the edge properties files
    URI flowEdgesURI = MultiHopFlowCompilerTest.class.getClassLoader().getResource("flowgraph/flowedges").toURI();
    fs = FileSystem.get(flowEdgesURI, new Configuration());
    Path flowEdgesPath = new Path(flowEdgesURI);
    for (FileStatus fileStatus : fs.listStatus(flowEdgesPath)) {
        // NOTE(review): warn-level logging of every edge file path looks like leftover debugging — consider debug level.
        log.warn(fileStatus.getPath().toString());
        try (InputStream is = fs.open(fileStatus.getPath())) {
            Config flowEdgeConfig = ConfigFactory.parseReader(new InputStreamReader(is, Charsets.UTF_8), options);
            Class flowEdgeFactoryClass = Class.forName(ConfigUtils.getString(flowEdgeConfig, FlowGraphConfigurationKeys.FLOW_EDGE_FACTORY_CLASS, FlowGraphConfigurationKeys.DEFAULT_FLOW_EDGE_FACTORY_CLASS));
            // NOTE(review): this passes the template-catalog 'config', not 'flowEdgeConfig', to the factory
            // constructor — confirm this is intended and not a copy/paste slip.
            FlowEdgeFactory flowEdgeFactory = (FlowEdgeFactory) GobblinConstructorUtils.invokeLongestConstructor(flowEdgeFactoryClass, config);
            List<String> specExecutorNames = ConfigUtils.getStringList(flowEdgeConfig, FlowGraphConfigurationKeys.FLOW_EDGE_SPEC_EXECUTORS_KEY);
            List<SpecExecutor> specExecutors = new ArrayList<>();
            // Resolve each declared executor name against the topology-spec catalog built above.
            for (String specExecutorName : specExecutorNames) {
                specExecutors.add(topologySpecMap.get(new URI(specExecutorName)).getSpecExecutor());
            }
            FlowEdge edge = flowEdgeFactory.createFlowEdge(flowEdgeConfig, flowCatalog, specExecutors);
            this.flowGraph.addFlowEdge(edge);
        }
    }
    // The compiler under test operates on the fully-populated graph.
    this.specCompiler = new MultiHopFlowCompiler(config, this.flowGraph);
}
Also used : Path(org.apache.hadoop.fs.Path) FSFlowTemplateCatalog(org.apache.gobblin.service.modules.template_catalog.FSFlowTemplateCatalog) FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) InputStreamReader(java.io.InputStreamReader) InputStream(java.io.InputStream) AzkabanProjectConfig(org.apache.gobblin.service.modules.orchestration.AzkabanProjectConfig) Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) BaseFlowGraph(org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph) Properties(java.util.Properties) URI(java.net.URI) TopologySpec(org.apache.gobblin.runtime.api.TopologySpec) FlowEdgeFactory(org.apache.gobblin.service.modules.flowgraph.FlowEdgeFactory) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) FileSystem(org.apache.hadoop.fs.FileSystem) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) AbstractSpecExecutor(org.apache.gobblin.runtime.spec_executorInstance.AbstractSpecExecutor) BeforeClass(org.testng.annotations.BeforeClass) AfterClass(org.testng.annotations.AfterClass) ConfigParseOptions(com.typesafe.config.ConfigParseOptions) BeforeClass(org.testng.annotations.BeforeClass)

Example 3 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

From the class GitFlowGraphMonitorTest, method testRemoveNode.

@Test(dependsOnMethods = "testRemoveEdge")
public void testRemoveNode() throws GitAPIException, IOException {
    // Remove the node definition files from the local working copy.
    node1File.delete();
    node2File.delete();
    // Both nodes must still be registered in the graph before the change is processed.
    Assert.assertNotNull(this.flowGraph.getNode("node1"));
    Assert.assertNotNull(this.flowGraph.getNode("node2"));
    // Stage the deletions, commit them, and push to the remote the monitor tracks.
    String node1Pattern = formNodeFilePath(this.node1Dir.getName(), this.node1File.getName());
    String node2Pattern = formNodeFilePath(this.node2Dir.getName(), this.node2File.getName());
    this.gitForPush.rm().addFilepattern(node1Pattern).call();
    this.gitForPush.rm().addFilepattern(node2Pattern).call();
    this.gitForPush.commit().setMessage("Node remove commit").call();
    this.gitForPush.push().setRemote("origin").setRefSpecs(this.masterRefSpec).call();
    // Let the monitor pick up the deletions and prune the graph accordingly.
    this.gitFlowGraphMonitor.processGitConfigChanges();
    // Both nodes should now be gone from the graph.
    Assert.assertNull(this.flowGraph.getNode("node1"));
    Assert.assertNull(this.flowGraph.getNode("node2"));
}
Also used : DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) Test(org.testng.annotations.Test) MultiHopFlowCompilerTest(org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)

Example 4 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

From the class AbstractPathFinder, method findPath.

@Override
public FlowGraphPath findPath() throws PathFinderException {
    FlowGraphPath result = new FlowGraphPath(flowSpec, flowExecutionId);
    // A complete result must reach every destination node; give up as soon as any one is unreachable.
    for (DataNode destinationNode : this.destNodes) {
        List<FlowEdgeContext> unicastPath = findPathUnicast(destinationNode);
        if (unicastPath == null) {
            log.error("Path to destination node {} could not be found for flow {}.", destinationNode.getId(), flowSpec.getUri());
            // No path to at least one of the destination nodes.
            return null;
        }
        log.info("Path to destination node {} found for flow {}. Path - {}", destinationNode.getId(), flowSpec.getUri(), unicastPath);
        result.addPath(unicastPath);
    }
    return result;
}
Also used : FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) FlowGraphPath(org.apache.gobblin.service.modules.flow.FlowGraphPath)

Example 5 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

From the class MultiHopFlowCompiler, method compileFlow.

/**
 * Compiles a {@link FlowSpec} into an executable plan by finding a path through the flow graph
 * from the flow's source node to each of its destination nodes.
 * @param spec an instance of {@link FlowSpec}.
 * @return A DAG of {@link JobExecutionPlan}s, which encapsulates the compiled {@link org.apache.gobblin.runtime.api.JobSpec}s
 * together with the {@link SpecExecutor} where the job can be executed; {@code null} if compilation
 * fails (unknown source/destination node, unauthorized data movement, no path found, or an
 * exception during path finding), with the reason recorded via {@code addCompilationError}.
 */
@Override
public Dag<JobExecutionPlan> compileFlow(Spec spec) {
    Preconditions.checkNotNull(spec);
    Preconditions.checkArgument(spec instanceof FlowSpec, "MultiHopFlowCompiler only accepts FlowSpecs");
    long startTime = System.nanoTime();
    FlowSpec flowSpec = (FlowSpec) spec;
    String source = ConfigUtils.getString(flowSpec.getConfig(), ServiceConfigKeys.FLOW_SOURCE_IDENTIFIER_KEY, "");
    String destination = ConfigUtils.getString(flowSpec.getConfig(), ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY, "");
    DataNode sourceNode = this.flowGraph.getNode(source);
    if (sourceNode == null) {
        flowSpec.addCompilationError(source, destination, String.format("Flowgraph does not have a node with id %s", source));
        return null;
    }
    // A flow may fan out to several destinations; every declared destination must resolve to a known node.
    List<String> destNodeIds = ConfigUtils.getStringList(flowSpec.getConfig(), ServiceConfigKeys.FLOW_DESTINATION_IDENTIFIER_KEY);
    List<DataNode> destNodes = destNodeIds.stream().map(this.flowGraph::getNode).collect(Collectors.toList());
    if (destNodes.contains(null)) {
        flowSpec.addCompilationError(source, destination, String.format("Flowgraph does not have a node with id %s", destNodeIds.get(destNodes.indexOf(null))));
        return null;
    }
    log.info(String.format("Compiling flow for source: %s and destination: %s", source, destination));
    List<FlowSpec> flowSpecs = splitFlowSpec(flowSpec);
    Dag<JobExecutionPlan> jobExecutionPlanDag = new Dag<>(new ArrayList<>());
    // Acquire the lock OUTSIDE the try block: if lock() itself threw inside the try, the finally
    // block would attempt to unlock a lock this thread does not hold.
    this.rwLock.readLock().lock();
    try {
        for (FlowSpec datasetFlowSpec : flowSpecs) {
            // Authorize movement from the source to each destination before doing any path finding.
            for (DataNode destNode : destNodes) {
                long authStartTime = System.nanoTime();
                try {
                    boolean authorized = this.dataMovementAuthorizer.isMovementAuthorized(flowSpec, sourceNode, destNode);
                    Instrumented.updateTimer(dataAuthorizationTimer, System.nanoTime() - authStartTime, TimeUnit.NANOSECONDS);
                    if (!authorized) {
                        String message = String.format("Data movement is not authorized for flow: %s, source: %s, destination: %s", flowSpec.getUri().toString(), source, destination);
                        log.error(message);
                        datasetFlowSpec.addCompilationError(source, destination, message);
                        return null;
                    }
                } catch (Exception e) {
                    Instrumented.markMeter(flowCompilationFailedMeter);
                    datasetFlowSpec.addCompilationError(source, destination, Throwables.getStackTraceAsString(e));
                    return null;
                }
            }
            // Compute the path from source to destination.
            FlowGraphPath flowGraphPath = flowGraph.findPath(datasetFlowSpec);
            if (flowGraphPath != null) {
                // Convert the path into a Dag of JobExecutionPlans.
                jobExecutionPlanDag = jobExecutionPlanDag.merge(flowGraphPath.asDag(this.config));
            }
        }
        if (jobExecutionPlanDag.isEmpty()) {
            Instrumented.markMeter(flowCompilationFailedMeter);
            String message = String.format("No path found from source: %s and destination: %s", source, destination);
            log.info(message);
            // Record the generic "no path" error only if no higher-priority error was already recorded.
            if (flowSpec.getCompilationErrors().stream().noneMatch(compilationError -> compilationError.errorPriority == 0)) {
                flowSpec.addCompilationError(source, destination, message);
            }
            return null;
        }
    } catch (PathFinder.PathFinderException | SpecNotFoundException | JobTemplate.TemplateException | URISyntaxException | ReflectiveOperationException e) {
        Instrumented.markMeter(flowCompilationFailedMeter);
        String message = String.format("Exception encountered while compiling flow for source: %s and destination: %s, %s", source, destination, Throwables.getStackTraceAsString(e));
        log.error(message, e);
        flowSpec.addCompilationError(source, destination, message);
        return null;
    } finally {
        this.rwLock.readLock().unlock();
    }
    Instrumented.markMeter(flowCompilationSuccessFulMeter);
    Instrumented.updateTimer(flowCompilationTimer, System.nanoTime() - startTime, TimeUnit.NANOSECONDS);
    return jobExecutionPlanDag;
}
Also used : DatasetDescriptorConfigKeys(org.apache.gobblin.service.modules.flowgraph.DatasetDescriptorConfigKeys) Getter(lombok.Getter) ObservingFSFlowEdgeTemplateCatalog(org.apache.gobblin.service.modules.template_catalog.ObservingFSFlowEdgeTemplateCatalog) ServiceManager(com.google.common.util.concurrent.ServiceManager) URISyntaxException(java.net.URISyntaxException) TimeoutException(java.util.concurrent.TimeoutException) ConfigValueFactory(com.typesafe.config.ConfigValueFactory) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) ConfigUtils(org.apache.gobblin.util.ConfigUtils) StringUtils(org.apache.commons.lang3.StringUtils) ArrayList(java.util.ArrayList) PathFinder(org.apache.gobblin.service.modules.flowgraph.pathfinder.PathFinder) Lists(com.google.common.collect.Lists) Optional(com.google.common.base.Optional) Path(org.apache.hadoop.fs.Path) JobTemplate(org.apache.gobblin.runtime.api.JobTemplate) BaseFlowGraph(org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph) ServiceConfigKeys(org.apache.gobblin.service.ServiceConfigKeys) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) Spec(org.apache.gobblin.runtime.api.Spec) Logger(org.slf4j.Logger) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) Dag(org.apache.gobblin.service.modules.flowgraph.Dag) Config(com.typesafe.config.Config) ClassAliasResolver(org.apache.gobblin.util.ClassAliasResolver) Instrumented(org.apache.gobblin.instrumented.Instrumented) Throwables(com.google.common.base.Throwables) IOException(java.io.IOException) ConfigurationKeys(org.apache.gobblin.configuration.ConfigurationKeys) Collectors(java.util.stream.Collectors) InvocationTargetException(java.lang.reflect.InvocationTargetException) TimeUnit(java.util.concurrent.TimeUnit) Alpha(org.apache.gobblin.annotation.Alpha) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) 
CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) Slf4j(lombok.extern.slf4j.Slf4j) FlowGraph(org.apache.gobblin.service.modules.flowgraph.FlowGraph) Preconditions(com.google.common.base.Preconditions) ConstructorUtils(org.apache.commons.lang3.reflect.ConstructorUtils) VisibleForTesting(com.google.common.annotations.VisibleForTesting) JobExecutionPlan(org.apache.gobblin.service.modules.spec.JobExecutionPlan) Joiner(com.google.common.base.Joiner) FlowSpec(org.apache.gobblin.runtime.api.FlowSpec) GitFlowGraphMonitor(org.apache.gobblin.service.modules.core.GitFlowGraphMonitor) JobExecutionPlan(org.apache.gobblin.service.modules.spec.JobExecutionPlan) Dag(org.apache.gobblin.service.modules.flowgraph.Dag) URISyntaxException(java.net.URISyntaxException) URISyntaxException(java.net.URISyntaxException) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) InvocationTargetException(java.lang.reflect.InvocationTargetException) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) FlowSpec(org.apache.gobblin.runtime.api.FlowSpec)

Aggregations

DataNode (org.apache.gobblin.service.modules.flowgraph.DataNode)12 Config (com.typesafe.config.Config)6 Test (org.testng.annotations.Test)5 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 SpecExecutor (org.apache.gobblin.runtime.api.SpecExecutor)3 FlowEdgeContext (org.apache.gobblin.service.modules.flow.FlowEdgeContext)3 MultiHopFlowCompilerTest (org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)3 Path (org.apache.hadoop.fs.Path)3 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 LinkedList (java.util.LinkedList)2 BaseFlowGraph (org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Joiner (com.google.common.base.Joiner)1 Optional (com.google.common.base.Optional)1 Preconditions (com.google.common.base.Preconditions)1 Throwables (com.google.common.base.Throwables)1 Lists (com.google.common.collect.Lists)1 ServiceManager (com.google.common.util.concurrent.ServiceManager)1