Search in sources :

Example 11 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class SftpDataNodeTest method testCreate.

/**
 * Verifies {@link SftpDataNode} construction from a {@link Config}:
 * <ul>
 *   <li>with only a hostname and node id, the port falls back to
 *       {@code ConfigurationKeys.SOURCE_CONN_DEFAULT_PORT};</li>
 *   <li>with an explicit {@code SftpDataNode.SFTP_PORT}, that port is used and all other
 *       properties are unchanged;</li>
 *   <li>without a hostname, construction fails with a
 *       {@link DataNode.DataNodeCreationException}.</li>
 * </ul>
 *
 * @throws DataNode.DataNodeCreationException if one of the nodes expected to be valid cannot be created.
 */
@Test
public void testCreate() throws DataNode.DataNodeCreationException {
    // Create a SFTP DataNode with default SFTP port
    Config config = ConfigFactory.empty()
        .withValue(SftpDataNode.SFTP_HOSTNAME, ConfigValueFactory.fromAnyRef("testHost"))
        .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef("testId"));
    SftpDataNode dataNode = new SftpDataNode(config);
    Assert.assertEquals(dataNode.getId(), "testId");
    Assert.assertEquals(dataNode.getHostName(), "testHost");
    Assert.assertEquals(dataNode.getPort().intValue(), ConfigurationKeys.SOURCE_CONN_DEFAULT_PORT);
    Assert.assertEquals(dataNode.getDefaultDatasetDescriptorPlatform(), SftpDataNode.PLATFORM);
    Assert.assertEquals(dataNode.getDefaultDatasetDescriptorClass(), FSDatasetDescriptor.class.getCanonicalName());

    // Create a SFTP DataNode with an explicitly configured port. Every assertion below targets the
    // newly created node so that the second construction path is actually verified.
    config = config.withValue(SftpDataNode.SFTP_PORT, ConfigValueFactory.fromAnyRef(143));
    SftpDataNode dataNodeWithPort = new SftpDataNode(config);
    Assert.assertEquals(dataNodeWithPort.getId(), "testId");
    Assert.assertEquals(dataNodeWithPort.getHostName(), "testHost");
    Assert.assertEquals(dataNodeWithPort.getPort().intValue(), 143);
    Assert.assertEquals(dataNodeWithPort.getDefaultDatasetDescriptorPlatform(), SftpDataNode.PLATFORM);
    Assert.assertEquals(dataNodeWithPort.getDefaultDatasetDescriptorClass(), FSDatasetDescriptor.class.getCanonicalName());

    // A config that lacks the hostname must be rejected.
    Config configMissingProps = ConfigFactory.empty()
        .withValue(FlowGraphConfigurationKeys.DATA_NODE_ID_KEY, ConfigValueFactory.fromAnyRef("testId"));
    try {
        new SftpDataNode(configMissingProps);
        Assert.fail("Unexpected success in creating Sftp node.");
    } catch (DataNode.DataNodeCreationException expected) {
        // Expected exception.
    }
}
Also used : Config(com.typesafe.config.Config) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) FSDatasetDescriptor(org.apache.gobblin.service.modules.dataset.FSDatasetDescriptor) Test(org.testng.annotations.Test)

Example 12 with DataNode

use of org.apache.gobblin.service.modules.flowgraph.DataNode in project incubator-gobblin by apache.

the class AbstractPathFinder method getNextEdges.

/**
 * Builds the list of candidate {@link FlowEdge}s leaving {@code dataNode}, ordered by priority.
 *
 * <p>An edge is considered only if it passes the edge-id whitelist read from the flow config (an empty
 * whitelist admits every edge) and both the edge and its destination node are active. For each such edge,
 * the executors are tried in order; for every dataset descriptor pair of the edge's flow template that
 * resolves and whose input descriptor contains {@code currentDatasetDescriptor}, a {@link FlowEdgeContext}
 * is produced. Contexts whose output format config is contained in the destination's format config are
 * placed at the front of the list; all others are appended. The first executor that yields at least one
 * context is used and the remaining executors are not tried.
 *
 * <p>Template resolution failures are recorded as compilation errors on the flow spec; other failures for
 * an edge are logged and the edge is skipped.
 *
 * @param dataNode the {@link DataNode} to be expanded for determining candidate edges.
 * @param currentDatasetDescriptor Output {@link DatasetDescriptor} of the current edge.
 * @param destDatasetDescriptor Target {@link DatasetDescriptor}.
 * @return prioritized list of {@link FlowEdgeContext}s to be added to the edge queue for expansion.
 */
List<FlowEdgeContext> getNextEdges(DataNode dataNode, DatasetDescriptor currentDatasetDescriptor, DatasetDescriptor destDatasetDescriptor) {
    List<FlowEdgeContext> candidates = new LinkedList<>();
    List<String> whitelistedEdgeIds = ConfigUtils.getStringList(this.flowConfig, ConfigurationKeys.WHITELISTED_EDGE_IDS);

    for (FlowEdge flowEdge : this.flowGraph.getEdges(dataNode)) {
        // An empty whitelist means "no restriction"; otherwise the edge must be explicitly listed.
        boolean whitelisted = whitelistedEdgeIds.isEmpty() || whitelistedEdgeIds.contains(flowEdge.getId());
        if (!whitelisted) {
            continue;
        }

        try {
            DataNode destinationNode = this.flowGraph.getNode(flowEdge.getDest());
            // Only expand edges that are active and lead to an active node.
            if (!(destinationNode.isActive() && flowEdge.isActive())) {
                continue;
            }

            boolean executorResolved = false;
            // Walk the executors in order and settle on the first one that yields a usable context.
            for (SpecExecutor specExecutor : flowEdge.getExecutors()) {
                Config mergedConfig = getMergedConfig(flowEdge);
                List<Pair<DatasetDescriptor, DatasetDescriptor>> descriptorPairs =
                    flowEdge.getFlowTemplate().getDatasetDescriptors(mergedConfig, false);

                for (Pair<DatasetDescriptor, DatasetDescriptor> descriptorPair : descriptorPairs) {
                    DatasetDescriptor edgeInput = descriptorPair.getLeft();
                    DatasetDescriptor edgeOutput = descriptorPair.getRight();

                    try {
                        flowEdge.getFlowTemplate().tryResolving(mergedConfig, edgeInput, edgeOutput);
                    } catch (JobTemplate.TemplateException | ConfigException | SpecNotFoundException e) {
                        // Record why this descriptor pair could not be compiled, then try the next pair.
                        flowSpec.addCompilationError(
                            flowEdge.getSrc(),
                            flowEdge.getDest(),
                            "Error compiling edge " + flowEdge.toString() + ": " + e.toString());
                        continue;
                    }

                    // The edge is usable only if it can consume the dataset produced so far.
                    if (!edgeInput.contains(currentDatasetDescriptor)) {
                        continue;
                    }

                    DatasetDescriptor specificOutput = makeOutputDescriptorSpecific(currentDatasetDescriptor, edgeOutput);
                    FlowEdgeContext edgeContext =
                        new FlowEdgeContext(flowEdge, currentDatasetDescriptor, specificOutput, mergedConfig, specExecutor);

                    // Edges whose output already matches the platform-independent (format) properties of
                    // the destination go to the front: transformations are preferred as close to the
                    // source as possible.
                    boolean formatCompatible =
                        destDatasetDescriptor.getFormatConfig().contains(edgeOutput.getFormatConfig());
                    if (formatCompatible) {
                        candidates.add(0, edgeContext);
                    } else {
                        candidates.add(edgeContext);
                    }
                    executorResolved = true;
                }

                // TODO: Choose the min-cost executor for the FlowEdge as opposed to the first one that resolves.
                if (executorResolved) {
                    break;
                }
            }
        } catch (IOException | ReflectiveOperationException | SpecNotFoundException | JobTemplate.TemplateException e) {
            // Skip the edge; and continue
            log.warn("Skipping edge {} with config {} due to exception: {}", flowEdge.getId(), flowConfig.toString(), e);
        }
    }
    return candidates;
}
Also used : FlowEdge(org.apache.gobblin.service.modules.flowgraph.FlowEdge) DatasetDescriptor(org.apache.gobblin.service.modules.dataset.DatasetDescriptor) Config(com.typesafe.config.Config) ConfigException(com.typesafe.config.ConfigException) IOException(java.io.IOException) LinkedList(java.util.LinkedList) FlowEdgeContext(org.apache.gobblin.service.modules.flow.FlowEdgeContext) SpecNotFoundException(org.apache.gobblin.runtime.api.SpecNotFoundException) DataNode(org.apache.gobblin.service.modules.flowgraph.DataNode) SpecExecutor(org.apache.gobblin.runtime.api.SpecExecutor) Pair(org.apache.commons.lang3.tuple.Pair)

Aggregations

DataNode (org.apache.gobblin.service.modules.flowgraph.DataNode)12 Config (com.typesafe.config.Config)6 Test (org.testng.annotations.Test)5 IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 SpecExecutor (org.apache.gobblin.runtime.api.SpecExecutor)3 FlowEdgeContext (org.apache.gobblin.service.modules.flow.FlowEdgeContext)3 MultiHopFlowCompilerTest (org.apache.gobblin.service.modules.flow.MultiHopFlowCompilerTest)3 Path (org.apache.hadoop.fs.Path)3 URI (java.net.URI)2 URISyntaxException (java.net.URISyntaxException)2 LinkedList (java.util.LinkedList)2 BaseFlowGraph (org.apache.gobblin.service.modules.flowgraph.BaseFlowGraph)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Joiner (com.google.common.base.Joiner)1 Optional (com.google.common.base.Optional)1 Preconditions (com.google.common.base.Preconditions)1 Throwables (com.google.common.base.Throwables)1 Lists (com.google.common.collect.Lists)1 ServiceManager (com.google.common.util.concurrent.ServiceManager)1