
Example 11 with PipelinePhase

Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class ETLWorker, method initialize.

@Override
public void initialize(final WorkerContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    super.initialize(context);
    Map<String, String> properties = context.getSpecification().getProperties();
    appName = context.getApplicationSpecification().getName();
    Preconditions.checkArgument(properties.containsKey(Constants.PIPELINEID));
    Preconditions.checkArgument(properties.containsKey(UNIQUE_ID));
    String uniqueId = properties.get(UNIQUE_ID);
    // Each worker instance should have its own unique state.
    // This final local shadows the appName field assigned above so the anonymous TxRunnable below can capture it.
    final String appName = context.getApplicationSpecification().getName();
    stateStoreKey = String.format("%s%s%s%s%s", appName, SEPARATOR, uniqueId, SEPARATOR, context.getInstanceId());
    stateStoreKeyBytes = Bytes.toBytes(stateStoreKey);
    Transactionals.execute(getContext(), new TxRunnable() {

        @Override
        public void run(DatasetContext dsContext) throws Exception {
            KeyValueTable stateTable = dsContext.getDataset(ETLRealtimeApplication.STATE_TABLE);
            byte[] startKey = Bytes.toBytes(String.format("%s%s", appName, SEPARATOR));
            // Scan the table for keys with the "appName:" prefix and delete rows that don't match this worker's unique state key.
            try (CloseableIterator<KeyValue<byte[], byte[]>> rows = stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
                while (rows.hasNext()) {
                    KeyValue<byte[], byte[]> row = rows.next();
                    if (Bytes.compareTo(stateStoreKeyBytes, row.getKey()) != 0) {
                        stateTable.delete(row.getKey());
                    }
                }
            }
        }
    }, Exception.class);
    PipelinePhase pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), PipelinePhase.class);
    Map<String, TransformDetail> transformationMap = new HashMap<>();
    initializeSource(context, pipeline);
    initializeTransforms(context, transformationMap, pipeline);
    initializeSinks(context, transformationMap, pipeline);
    Set<String> startStages = new HashSet<>();
    startStages.addAll(pipeline.getStageOutputs(sourceStageName));
    transformExecutor = new TransformExecutor(transformationMap, startStages);
}
Also used : CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator), KeyValue(co.cask.cdap.api.dataset.lib.KeyValue), HashMap(java.util.HashMap), IOException(java.io.IOException), TxRunnable(co.cask.cdap.api.TxRunnable), KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), TransformDetail(co.cask.cdap.etl.common.TransformDetail), TransformExecutor(co.cask.cdap.etl.common.TransformExecutor), DatasetContext(co.cask.cdap.api.data.DatasetContext), HashSet(java.util.HashSet)
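
The composite state key built in initialize() keeps exactly one row per worker instance. A minimal sketch of the key layout, assuming SEPARATOR is ":" (the real constant is defined in ETLWorker; all values here are illustrative):

// Illustrative only: assumes SEPARATOR is ":" and uses made-up values.
String appName = "purchasePipeline"; // hypothetical application name
String uniqueId = "3f9c";            // hypothetical per-deployment unique id
int instanceId = 2;                  // this worker's instance number
String stateStoreKey = String.format("%s%s%s%s%s", appName, ":", uniqueId, ":", instanceId);
// => "purchasePipeline:3f9c:2"
// Scanning the prefix "purchasePipeline:" and deleting every key that is not
// stateStoreKey prunes state left behind by older deployments of the same app.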

Example 12 with PipelinePhase

Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class PipelinePlannerTest, method testSimpleCondition.

@Test
public void testSimpleCondition() throws Exception {
    /*
      n1 - n2 - condition - n3
                      |
                      |---- n4
     */
    Set<StageSpec> stageSpecs = ImmutableSet.of(
        StageSpec.builder("n1", NODE).build(),
        StageSpec.builder("n2", NODE).build(),
        StageSpec.builder("condition", CONDITION).build(),
        StageSpec.builder("n3", NODE).build(),
        StageSpec.builder("n4", NODE).build());
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "condition"),
        new Connection("condition", "n3", true),
        new Connection("condition", "n4", false));
    Set<String> pluginTypes = ImmutableSet.of(NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE, CONDITION.getType());
    Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
    Set<String> emptySet = ImmutableSet.of();
    PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, emptySet, emptySet, emptySet);
    PipelineSpec pipelineSpec = PipelineSpec.builder().addStages(stageSpecs).addConnections(connections).build();
    Map<String, PipelinePhase> phases = new HashMap<>();
    /*
      n1--n2--condition.connector
     */
    PipelinePhase phase1 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("n1", NODE).build())
        .addStage(StageSpec.builder("n2", NODE).build())
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SINK_TYPE)).build())
        .addConnection("n1", "n2")
        .addConnection("n2", "condition.connector")
        .build();
    Dag controlPhaseDag = new Dag(ImmutableSet.of(new Connection("n1", "n2"), new Connection("n2", "condition")));
    String phase1Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase1Name, phase1);
    /*
      condition
     */
    PipelinePhase phase2 = PipelinePhase.builder(pluginTypes).addStage(StageSpec.builder("condition", CONDITION).build()).build();
    String phase2Name = "condition";
    phases.put(phase2Name, phase2);
    /*
      condition.connector -- n3
     */
    PipelinePhase phase3 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n3", NODE).build())
        .addConnection("condition.connector", "n3")
        .build();
    controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n3")));
    String phase3Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase3Name, phase3);
    /*
      condition.connector -- n4
     */
    PipelinePhase phase4 = PipelinePhase.builder(pluginTypes)
        .addStage(StageSpec.builder("condition.connector", connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build())
        .addStage(StageSpec.builder("n4", NODE).build())
        .addConnection("condition.connector", "n4")
        .build();
    controlPhaseDag = new Dag(ImmutableSet.of(new Connection("condition", "n4")));
    String phase4Name = PipelinePlanner.getPhaseName(controlPhaseDag);
    phases.put(phase4Name, phase4);
    Set<Connection> phaseConnections = new HashSet<>();
    phaseConnections.add(new Connection(phase1Name, phase2Name));
    phaseConnections.add(new Connection(phase2Name, phase3Name, true));
    phaseConnections.add(new Connection(phase2Name, phase4Name, false));
    PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
    PipelinePlan actual = planner.plan(pipelineSpec);
    Assert.assertEquals(expected, actual);
}
Also used : HashMap(java.util.HashMap), Connection(co.cask.cdap.etl.proto.Connection), PipelineSpec(co.cask.cdap.etl.spec.PipelineSpec), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), StageSpec(co.cask.cdap.etl.spec.StageSpec), HashSet(java.util.HashSet), Test(org.junit.Test)
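
The test drives PipelinePhase.builder directly. As a minimal sketch, a linear two-stage phase uses the same calls (NODE is the test's plugin-type fixture; the stage names are illustrative):

// Minimal sketch built from the same builder API exercised by the test above.
Set<String> pluginTypes = ImmutableSet.of(NODE.getType());
PipelinePhase phase = PipelinePhase.builder(pluginTypes)
    .addStage(StageSpec.builder("source", NODE).build())
    .addStage(StageSpec.builder("sink", NODE).build())
    .addConnection("source", "sink")
    .build();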

Example 13 with PipelinePhase

Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class ETLSpark, method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    List<Finisher> finishers = new ArrayList<>();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
    final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    final Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
    final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    final Admin admin = context.getAdmin();
    PipelinePhase phase = phaseSpec.getPhase();
    // go through in topological order so that arguments set by one stage are seen by stages after it
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {

                @Override
                public BatchSourceContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            // SparkBatchSourceContext also serves as the StageSubmitterContext for transforms.
            ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {

                @Override
                public StageSubmitterContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {

                @Override
                public BatchSinkContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {

                @Override
                public SparkPluginContext getContext(DatasetContext datasetContext) {
                    return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext sparkJoinerContext) {
                    stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = new CompositeFinisher(finishers);
    context.localize("HydratorSpark.config", configFile.toURI());
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext), HashMap(java.util.HashMap), ArrayList(java.util.ArrayList), SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory), SparkClientContext(co.cask.cdap.api.spark.SparkClientContext), CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher), SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin), Finisher(co.cask.cdap.etl.common.submit.Finisher), StageSubmitterContext(co.cask.cdap.etl.api.StageSubmitterContext), BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator), DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator), BasicArguments(co.cask.cdap.etl.common.BasicArguments), DatasetContext(co.cask.cdap.api.data.DatasetContext), JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider), ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider), AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext), Map(java.util.Map), File(java.io.File), MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator), PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime), WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken), SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext), DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext), StageSpec(co.cask.cdap.etl.spec.StageSpec), PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator), PluginContext(co.cask.cdap.api.plugin.PluginContext), BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext), Admin(co.cask.cdap.api.Admin), BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext), BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner), BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec), Transform(co.cask.cdap.etl.api.Transform), SparkConf(org.apache.spark.SparkConf), BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable), Writer(java.io.Writer), TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)
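
Every branch above wraps its context construction in an anonymous ContextProvider. Assuming ContextProvider<T> declares the single method T getContext(DatasetContext datasetContext), which its usage here suggests but this snippet does not confirm, the source branch could be written more compactly as a lambda:

// Sketch only: assumes ContextProvider<T> is a functional interface with
// a single method T getContext(DatasetContext datasetContext).
ContextProvider<BatchSourceContext> contextProvider = datasetContext ->
    new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);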

Example 14 with PipelinePhase

Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class PipelinePlanner, method dagToPipeline.

/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags
 * @param specs specifications for every stage
 * @param conditionConnectors connector nodes created for condition stages
 * @return the converted PipelinePhase
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs, Map<String, String> conditionConnectors) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
    for (String stageName : dag.getTopologicalOrder()) {
        Set<String> outputs = dag.getNodeOutputs(stageName);
        if (!outputs.isEmpty()) {
            phaseBuilder.addConnections(stageName, outputs);
        }
        // add connectors
        String originalName = connectors.get(stageName);
        if (originalName != null || conditionConnectors.containsValue(stageName)) {
            String connectorType = dag.getSources().contains(stageName) ? Constants.Connector.SOURCE_TYPE : Constants.Connector.SINK_TYPE;
            PluginSpec connectorSpec = new PluginSpec(
                Constants.Connector.PLUGIN_TYPE, "connector",
                ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName != null ? originalName : stageName,
                                Constants.Connector.TYPE, connectorType),
                null);
            phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
            continue;
        }
        // add other plugin types
        StageSpec spec = specs.get(stageName);
        phaseBuilder.addStage(spec);
    }
    return phaseBuilder.build();
}
Also used : PluginSpec(co.cask.cdap.etl.spec.PluginSpec), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), StageSpec(co.cask.cdap.etl.spec.StageSpec)
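
dagToPipeline is a private helper, so the following is only a conceptual sketch of its call shape, reusing the Dag and Connection constructors seen in the planner test above (NODE and the maps are hypothetical fixtures):

// Conceptual sketch; all arguments are hypothetical.
Dag phaseDag = new Dag(ImmutableSet.of(new Connection("n1", "n2")));
Map<String, String> connectors = ImmutableMap.of();          // no connector nodes in this dag
Map<String, String> conditionConnectors = ImmutableMap.of(); // no condition connectors either
Map<String, StageSpec> specs = ImmutableMap.of(
    "n1", StageSpec.builder("n1", NODE).build(),
    "n2", StageSpec.builder("n2", NODE).build());
PipelinePhase phase = dagToPipeline(phaseDag, connectors, specs, conditionConnectors);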

Example 15 with PipelinePhase

Use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class SmartWorkflow, method addProgram.

private WorkflowProgramAdder addProgram(String phaseName, WorkflowProgramAdder programAdder) {
    PipelinePhase phase = plan.getPhase(phaseName);
    // a null phase means this phase name was artificially added by the control dag flattening process, so there is nothing to add; skip it
    if (phase == null) {
        return programAdder;
    }
    // can't use phase name as a program name because it might contain invalid characters
    String programName = "phase-" + phaseNum;
    phaseNum++;
    BatchPhaseSpec batchPhaseSpec = getPhaseSpec(programName, phase);
    Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
    if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
        // actions will be all by themselves in a phase
        programAdder.addAction(new PipelineAction(batchPhaseSpec));
    } else if (pluginTypes.contains(Condition.PLUGIN_TYPE)) {
        // conditions will be all by themselves in a phase
        // addCondition(programAdder, phaseName, batchPhaseSpec);
        programAdder = programAdder.condition(new PipelineCondition(batchPhaseSpec));
    } else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
        // spark programs will be all by themselves in a phase
        String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
        StageSpec stageSpec = stageSpecs.get(stageName);
        applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
        programAdder.addSpark(programName);
    } else if (useSpark) {
        applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
        programAdder.addSpark(programName);
    } else {
        applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec, new HashSet<>(connectorDatasets.values())));
        programAdder.addMapReduce(programName);
    }
    return programAdder;
}
Also used : ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark), ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), StageSpec(co.cask.cdap.etl.spec.StageSpec), BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec), PipelineCondition(co.cask.cdap.etl.batch.condition.PipelineCondition), PipelineAction(co.cask.cdap.etl.batch.customaction.PipelineAction)
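
The chain of branches in addProgram is driven entirely by the plugin types present in the phase. A condensed restatement of that dispatch, using the same accessors as the code above:

// Condensed view of the dispatch: each phase maps to exactly one program type.
Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
    // custom action phase: runs as a workflow action
} else if (pluginTypes.contains(Condition.PLUGIN_TYPE)) {
    // condition phase: becomes a workflow condition node
} else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
    // external Spark program phase
} else {
    // data phase: ETLSpark when the engine is Spark, otherwise ETLMapReduce
}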

Aggregations

PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 17
StageSpec (co.cask.cdap.etl.spec.StageSpec): 15
HashMap (java.util.HashMap): 13
HashSet (java.util.HashSet): 7
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 5
Connection (co.cask.cdap.etl.proto.Connection): 5
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 5
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 4
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 4
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime): 4
Map (java.util.Map): 4
Test (org.junit.Test): 4
DatasetContext (co.cask.cdap.api.data.DatasetContext): 3
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 3
PluginContext (co.cask.cdap.api.plugin.PluginContext): 3
TxRunnable (co.cask.cdap.api.TxRunnable): 2
TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy): 2
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 2
BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable): 2
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 2