Example 16 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

From the class PipelinePlannerTest, the method testSimpleCondition:

@Test
public void testSimpleCondition() throws Exception {
    /*
      n1 - n2 - condition - n3
                      |
                      |---- n4
     */
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("n1", NODE).build(),
        StageSpec.builder("n2", NODE).build(),
        StageSpec.builder("condition", CONDITION).build(),
        StageSpec.builder("n3", NODE).build(),
        StageSpec.builder("n4", NODE).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "condition"),
        new Connection("condition", "n3", true),
        new Connection("condition", "n4", false));
    Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE, CONDITION.getType());
    Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
    Set<String> emptySet = ImmutableSet.of();
    PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, emptySet, emptySet, emptySet);
    PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
    Map<String, PipelinePhase> phases = new HashMap<>();
    /*
      n1--n2--condition.connector
     */
    PipelinePhase phase1 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("n1", NODE).build())
        .addStage(StageSpec.builder("n2", NODE).build())
        .addStage(StageSpec.builder("condition.connector",
                                    connectorSpec("condition.connector", Constants.Connector.SINK_TYPE)).build())
        .addConnection("n1", "n2")
        .addConnection("n2", "condition.connector")
        .build();
    Dag controlPhaseDag = new Dag(ImmutableSet.of(new Connection("n1", "n2"), new Connection("n2", "condition")));
    String phase1Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase1Name, phase1);
    /*
      condition
     */
    PipelinePhase phase2 = PipelinePhase.builder(pluginTypes).addStage(StageSpec.builder("condition", CONDITION).build()).build();
    String phase2Name = "condition";
    phases.put(phase2Name, phase2);
    /*
      condition.connector -- n3
     */
    PipelinePhase phase3 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector",
                                    connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n3", NODE).build())
        .addConnection("condition.connector", "n3")
        .build();
    controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n3")));
    String phase3Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase3Name, phase3);
    /*
      condition.connector -- n4
     */
    PipelinePhase phase4 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector",
                                    connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n4", NODE).build())
        .addConnection("condition.connector", "n4")
        .build();
    controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n4")));
    String phase4Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase4Name, phase4);
    Set<Connection> phaseConnections = new HashSet<>();
    phaseConnections.add(new Connection(phase1Name, phase2Name));
    phaseConnections.add(new Connection(phase2Name, phase3Name, true));
    phaseConnections.add(new Connection(phase2Name, phase4Name, false));
    PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
    PipelinePlan actual = planner.plan(pipelineSpec);
    Assert.assertEquals(expected, actual);
}
Also used : HashMap(java.util.HashMap) Connection(co.cask.cdap.etl.proto.Connection) PipelineSpec(co.cask.cdap.etl.spec.PipelineSpec) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) HashSet(java.util.HashSet) Test(org.junit.Test)
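
The test builds its expected connector stages with a connectorSpec(...) helper that the excerpt does not show. A minimal sketch of what that helper could look like, assuming it mirrors the PluginSpec that the planner itself builds for connectors (see Example 20 below):

private static PluginSpec connectorSpec(String originalName, String type) {
    // Hypothetical reconstruction: produce the same connector PluginSpec that
    // PipelinePlanner.dagToPipeline creates, so the expected plan in the test
    // compares equal to what the planner emits.
    return new PluginSpec(Constants.Connector.PLUGIN_TYPE, "connector",
                          ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName,
                                          Constants.Connector.TYPE, type),
                          null);
}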

Example 17 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

From the class DynamicSparkCompute, the method lazyInit:

// when checkpointing is enabled, and Spark is loading DStream operations from an existing checkpoint,
// delegate will be null and the initialize() method won't have been called. So we need to instantiate
// the delegate and initialize it.
private void lazyInit(final JavaSparkContext jsc) throws Exception {
    if (delegate == null) {
        PluginFunctionContext pluginFunctionContext = dynamicDriverContext.getPluginFunctionContext();
        delegate = pluginFunctionContext.createPlugin();
        final StageSpec stageSpec = pluginFunctionContext.getStageSpec();
        final JavaSparkExecutionContext sec = dynamicDriverContext.getSparkExecutionContext();
        Transactionals.execute(sec, new TxRunnable() {

            @Override
            public void run(DatasetContext datasetContext) throws Exception {
                PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
                SparkExecutionPluginContext sparkPluginContext = new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
                delegate.initialize(sparkPluginContext);
            }
        }, Exception.class);
    }
}
Also used : BasicSparkExecutionPluginContext(co.cask.cdap.etl.spark.batch.BasicSparkExecutionPluginContext) PluginFunctionContext(co.cask.cdap.etl.spark.function.PluginFunctionContext) SparkExecutionPluginContext(co.cask.cdap.etl.api.batch.SparkExecutionPluginContext) BasicSparkExecutionPluginContext(co.cask.cdap.etl.spark.batch.BasicSparkExecutionPluginContext) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) SparkPipelineRuntime(co.cask.cdap.etl.spark.SparkPipelineRuntime) SparkPipelineRuntime(co.cask.cdap.etl.spark.SparkPipelineRuntime) TxRunnable(co.cask.cdap.api.TxRunnable) StageSpec(co.cask.cdap.etl.spec.StageSpec) JavaSparkExecutionContext(co.cask.cdap.api.spark.JavaSparkExecutionContext) DatasetContext(co.cask.cdap.api.data.DatasetContext)
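
Because the null guard runs on every call, it is the entry points of the class that invoke it. A minimal sketch of how a transform method would use lazyInit (an illustrative reconstruction, not the verbatim class):

public JavaRDD<OUT> transform(SparkExecutionPluginContext context, JavaRDD<IN> input) throws Exception {
    // After a checkpoint restore this is the first time the delegate is needed,
    // so make sure it exists and has been initialized before delegating.
    lazyInit(JavaSparkContext.fromSparkContext(input.context()));
    return delegate.transform(context, input);
}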

Example 18 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

From the class ETLSpark, the method initialize:

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    List<Finisher> finishers = new ArrayList<>();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
    final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    final Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
    final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    final Admin admin = context.getAdmin();
    PipelinePhase phase = phaseSpec.getPhase();
    // go through in topological order so that arguments set by one stage are seen by stages after it
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {

                @Override
                public BatchSourceContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {

                @Override
                public StageSubmitterContext getContext(DatasetContext datasetContext) {
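                    // SparkBatchSourceContext is also a StageSubmitterContext, so the
                    // same context type is reused when preparing a transform stage.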
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {

                @Override
                public BatchSinkContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {

                @Override
                public SparkPluginContext getContext(DatasetContext datasetContext) {
                    return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext sparkJoinerContext) {
                    stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = new CompositeFinisher(finishers);
    context.localize("HydratorSpark.config", configFile.toURI());
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) StageSubmitterContext(co.cask.cdap.etl.api.StageSubmitterContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(co.cask.cdap.etl.common.BasicArguments) DatasetContext(co.cask.cdap.api.data.DatasetContext) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) Map(java.util.Map) HashMap(java.util.HashMap) File(java.io.File) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) Admin(co.cask.cdap.api.Admin) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) Transform(co.cask.cdap.etl.api.Transform) SparkConf(org.apache.spark.SparkConf) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) Writer(java.io.Writer) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)
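
Each prepared stage is collected as a Finisher so its cleanup can run once the whole run completes; CompositeFinisher simply fans one onFinish call out to all of them. A minimal stand-alone sketch of that pattern (a hypothetical re-declaration for illustration; the real types live in co.cask.cdap.etl.common.submit):

import java.util.ArrayList;
import java.util.List;

interface Finisher {
    // invoked once when the run ends, with whether it succeeded
    void onFinish(boolean succeeded);
}

final class CompositeFinisher implements Finisher {
    private final List<Finisher> finishers;

    CompositeFinisher(List<Finisher> finishers) {
        this.finishers = new ArrayList<>(finishers);
    }

    @Override
    public void onFinish(boolean succeeded) {
        // fan out so every prepared stage gets a chance to clean up
        for (Finisher finisher : finishers) {
            finisher.onFinish(succeeded);
        }
    }
}

Note also that the "HydratorSpark.config" file localized at the end of initialize() is the same file that BatchSparkPipelineDriver.run reads back in Example 19 below.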

Example 19 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

From the class BatchSparkPipelineDriver, the method run:

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    numOfRecordsPreview = phaseSpec.getNumOfRecordsPreview();
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        runPipeline(phaseSpec.getPhase(), BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors);
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
    }
}
Also used : Path(java.nio.file.Path) HashMap(java.util.HashMap) SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory) SparkStageStatisticsCollector(co.cask.cdap.etl.spark.SparkStageStatisticsCollector) SparkStageStatisticsCollector(co.cask.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(co.cask.cdap.etl.common.StageStatisticsCollector) StageSpec(co.cask.cdap.etl.spec.StageSpec) BufferedReader(java.io.BufferedReader) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext)
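
The handshake with Example 18 is a plain Gson round trip through a one-line JSON file. A self-contained sketch of the same idiom, with a hypothetical payload class standing in for SparkBatchSourceSinkFactoryInfo:

import com.google.gson.Gson;
import java.io.BufferedReader;
import java.io.File;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.HashMap;
import java.util.Map;

public class ConfigRoundTrip {
    // Hypothetical payload; the real pipeline serializes SparkBatchSourceSinkFactoryInfo.
    static final class FactoryInfo {
        Map<String, Integer> stagePartitions = new HashMap<>();
    }

    public static void main(String[] args) throws Exception {
        Gson gson = new Gson();
        File configFile = File.createTempFile("HydratorSpark", ".config");
        // write side, as in ETLSpark.initialize (Example 18)
        try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
            writer.write(gson.toJson(new FactoryInfo()));
        }
        // read side, as in BatchSparkPipelineDriver.run above
        try (BufferedReader reader = Files.newBufferedReader(configFile.toPath(), StandardCharsets.UTF_8)) {
            FactoryInfo info = gson.fromJson(reader.readLine(), FactoryInfo.class);
            System.out.println(info.stagePartitions);
        }
    }
}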

Example 20 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

From the class PipelinePlanner, the method dagToPipeline:

/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags
 * @param specs specifications for every stage
 * @param conditionConnectors connector nodes created for condition stages
 * @return the PipelinePhase for the given dag
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs, Map<String, String> conditionConnectors) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
    for (String stageName : dag.getTopologicalOrder()) {
        Set<String> outputs = dag.getNodeOutputs(stageName);
        if (!outputs.isEmpty()) {
            phaseBuilder.addConnections(stageName, outputs);
        }
        // add connectors
        String originalName = connectors.get(stageName);
        if (originalName != null || conditionConnectors.values().contains(stageName)) {
            String connectorType = dag.getSources().contains(stageName) ? Constants.Connector.SOURCE_TYPE : Constants.Connector.SINK_TYPE;
            PluginSpec connectorSpec = new PluginSpec(
                Constants.Connector.PLUGIN_TYPE, "connector",
                ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName != null ? originalName : stageName,
                                Constants.Connector.TYPE, connectorType),
                null);
            phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
            continue;
        }
        // add other plugin types
        StageSpec spec = specs.get(stageName);
        phaseBuilder.addStage(spec);
    }
    return phaseBuilder.build();
}
Also used : PluginSpec(co.cask.cdap.etl.spec.PluginSpec) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)
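
The method depends on dag.getTopologicalOrder() so that by the time a stage is added, everything upstream of it has already been handled. A minimal sketch of such an ordering via Kahn's algorithm (a stand-alone illustration, not CDAP's Dag implementation):

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class Topo {
    /** Returns the nodes of a DAG in topological order (Kahn's algorithm). */
    static List<String> topologicalOrder(Map<String, Set<String>> outEdges) {
        // count incoming edges for every node
        Map<String, Integer> inDegree = new HashMap<>();
        for (String node : outEdges.keySet()) {
            inDegree.putIfAbsent(node, 0);
            for (String out : outEdges.get(node)) {
                inDegree.merge(out, 1, Integer::sum);
            }
        }
        Deque<String> ready = new ArrayDeque<>();
        for (Map.Entry<String, Integer> e : inDegree.entrySet()) {
            if (e.getValue() == 0) {
                ready.add(e.getKey());
            }
        }
        List<String> order = new ArrayList<>();
        while (!ready.isEmpty()) {
            String node = ready.poll();
            order.add(node);
            for (String out : outEdges.getOrDefault(node, Collections.emptySet())) {
                if (inDegree.merge(out, -1, Integer::sum) == 0) {
                    ready.add(out);
                }
            }
        }
        return order; // shorter than inDegree.size() only if the graph had a cycle
    }
}

For the pipeline in Example 16 this yields an order such as [n1, n2, condition, n3, n4] (the relative order of n3 and n4 may vary), which is also why, in Example 18, arguments set by one stage are visible to the stages after it.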

Aggregations

StageSpec (co.cask.cdap.etl.spec.StageSpec): 27 usages
HashMap (java.util.HashMap): 20 usages
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 15 usages
Map (java.util.Map): 10 usages
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime): 8 usages
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 7 usages
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 7 usages
Connection (co.cask.cdap.etl.proto.Connection): 7 usages
HashSet (java.util.HashSet): 7 usages
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 6 usages
PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext): 5 usages
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 5 usages
TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy): 4 usages
PluginContext (co.cask.cdap.api.plugin.PluginContext): 4 usages
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 4 usages
LinkedHashMap (java.util.LinkedHashMap): 4 usages
Test (org.junit.Test): 4 usages
DatasetContext (co.cask.cdap.api.data.DatasetContext): 2 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2 usages
SparkClientContext (co.cask.cdap.api.spark.SparkClientContext): 2 usages