Search in sources :

Example 16 with PipelinePhase

use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class SparkStreamingPipelineDriver, method run.

/**
 * Builds the streaming pipeline from the program specification, starts it, and blocks until the
 * streaming context terminates.
 *
 * <p>The pipeline spec is deserialized from the {@code Constants.PIPELINEID} property of the Spark
 * specification. Unless the spec reports that checkpoints are disabled, the checkpoint directory is
 * resolved as the base location of the {@code DataStreamsApp.CHECKPOINT_FILESET} dataset, followed by
 * the application name, followed by the checkpoint directory configured in the spec.
 *
 * @param sec the Spark execution context this driver runs in
 * @throws Exception if building, starting, or waiting on the streaming context fails
 */
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
    String specJson = sec.getSpecification().getProperty(Constants.PIPELINEID);
    final DataStreamsPipelineSpec pipelineSpec = GSON.fromJson(specJson, DataStreamsPipelineSpec.class);
    final PipelinePhase pipelinePhase = PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES)
        .addConnections(pipelineSpec.getConnections())
        .addStages(pipelineSpec.getStages())
        .build();

    // Stays null when checkpointing is turned off for this pipeline.
    String checkpointDir = null;
    if (!pipelineSpec.isCheckpointsDisabled()) {
        String appName = sec.getApplicationSpecification().getName();
        String configuredDir = pipelineSpec.getCheckpointDirectory();

        // The fileset can only be instantiated inside a TxRunnable, so its base location is
        // handed back to this method through a reference.
        final AtomicReference<Location> baseLocation = new AtomicReference<>();
        TxRunnable lookupBaseLocation = new TxRunnable() {

            @Override
            public void run(DatasetContext context) throws Exception {
                FileSet fileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
                baseLocation.set(fileSet.getBaseLocation());
            }
        };
        Transactionals.execute(sec, lookupBaseLocation, Exception.class);

        Location pipelineDir = baseLocation.get().append(appName).append(configuredDir);
        checkpointDir = pipelineDir.toURI().toString();
    }

    JavaStreamingContext streamingContext = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
    streamingContext.start();

    boolean terminated = false;
    try {
        // Most programs simply run forever. When CDAP stops the program, this wait is broken by an
        // interrupted exception instead of returning normally.
        terminated = streamingContext.awaitTerminationOrTimeout(Long.MAX_VALUE);
    } finally {
        // Without an explicit stop in that case, the program would hang and never stop.
        if (!terminated) {
            streamingContext.stop(true, pipelineSpec.isStopGracefully());
        }
    }
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) AtomicReference(java.util.concurrent.atomic.AtomicReference) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) TxRunnable(co.cask.cdap.api.TxRunnable) DatasetContext(co.cask.cdap.api.data.DatasetContext) Location(org.apache.twill.filesystem.Location)

Example 17 with PipelinePhase

use of co.cask.cdap.etl.common.PipelinePhase in project cdap by caskdata.

The class ETLBatchApplication, method configure.

/**
 * Configures the ETL batch application: generates the pipeline spec from the (converted) config,
 * checks that it has exactly one batch source and plans into exactly one phase, registers the
 * MapReduce or Spark program for the configured engine, and adds the workflow plus its time-based
 * schedule.
 *
 * @throws IllegalArgumentException if the pipeline does not have exactly one source, if planning does
 *     not yield exactly one phase, or if the configured engine is not handled by the switch below
 */
@Override
public void configure() {
    ETLBatchConfig config = getConfig().convertOldConfig();
    setDescription(DEFAULT_DESCRIPTION);

    BatchPipelineSpec spec = new BatchPipelineSpecGenerator<>(
        getConfigurer(),
        ImmutableSet.of(BatchSource.PLUGIN_TYPE),
        ImmutableSet.of(BatchSink.PLUGIN_TYPE),
        config.getEngine()).generateSpec(config);

    // Count the stages whose plugin type is the batch source type.
    int numSources = 0;
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            numSources += 1;
        }
    }
    if (numSources != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }

    // Every optional plugin-type set handed to the planner is empty.
    ImmutableSet<String> noTypes = ImmutableSet.<String>of();
    PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, noTypes, noTypes, noTypes, noTypes);
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen if there is only one source
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase onlyPhase = plan.getPhases().values().iterator().next();

    switch (config.getEngine()) {
        case MAPREDUCE: {
            BatchPhaseSpec mapReducePhaseSpec = new BatchPhaseSpec(
                ETLMapReduce.NAME,
                onlyPhase,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties(),
                false);
            addMapReduce(new ETLMapReduce(mapReducePhaseSpec));
            break;
        }
        case SPARK: {
            BatchPhaseSpec sparkPhaseSpec = new BatchPhaseSpec(
                ETLSpark.class.getSimpleName(),
                onlyPhase,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties(),
                false);
            addSpark(new ETLSpark(sparkPhaseSpec));
            break;
        }
        default:
            throw new IllegalArgumentException(
                String.format("Invalid execution engine '%s'. Must be one of %s.",
                    config.getEngine(), Joiner.on(',').join(Engine.values())));
    }

    addWorkflow(new ETLWorkflow(spec, config.getEngine()));
    schedule(
        buildSchedule(SCHEDULE_NAME, ProgramType.WORKFLOW, ETLWorkflow.NAME)
            .setDescription("ETL Batch schedule")
            .triggerByTime(config.getSchedule()));
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)

Aggregations

PipelinePhase (co.cask.cdap.etl.common.PipelinePhase)17 StageSpec (co.cask.cdap.etl.spec.StageSpec)15 HashMap (java.util.HashMap)13 HashSet (java.util.HashSet)7 BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)5 Connection (co.cask.cdap.etl.proto.Connection)5 PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec)5 WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken)4 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)4 PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime)4 Map (java.util.Map)4 Test (org.junit.Test)4 DatasetContext (co.cask.cdap.api.data.DatasetContext)3 MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)3 PluginContext (co.cask.cdap.api.plugin.PluginContext)3 TxRunnable (co.cask.cdap.api.TxRunnable)2 TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy)2 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)2 BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable)2 BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext)2