
Example 26 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class ETLBatchApplication method configure.

@Override
public void configure() {
    ETLBatchConfig config = getConfig().convertOldConfig();
    setDescription(DEFAULT_DESCRIPTION);
    BatchPipelineSpec spec = new BatchPipelineSpecGenerator<>(getConfigurer(),
                                                              ImmutableSet.of(BatchSource.PLUGIN_TYPE),
                                                              ImmutableSet.of(BatchSink.PLUGIN_TYPE),
                                                              config.getEngine()).generateSpec(config);
    int sourceCount = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            sourceCount++;
        }
    }
    if (sourceCount != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }
    PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES,
                                                  ImmutableSet.<String>of(), ImmutableSet.<String>of(),
                                                  ImmutableSet.<String>of(), ImmutableSet.<String>of());
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen if there is only one source
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase pipeline = plan.getPhases().values().iterator().next();
    switch (config.getEngine()) {
        case MAPREDUCE:
            BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(ETLMapReduce.NAME, pipeline,
                                                               config.getResources(),
                                                               config.getDriverResources(),
                                                               config.getClientResources(),
                                                               config.isStageLoggingEnabled(),
                                                               config.isProcessTimingEnabled(),
                                                               new HashMap<String, String>(),
                                                               config.getNumOfRecordsPreview(),
                                                               config.getProperties(), false);
            addMapReduce(new ETLMapReduce(batchPhaseSpec));
            break;
        case SPARK:
            batchPhaseSpec = new BatchPhaseSpec(ETLSpark.class.getSimpleName(), pipeline,
                                                config.getResources(),
                                                config.getDriverResources(),
                                                config.getClientResources(),
                                                config.isStageLoggingEnabled(),
                                                config.isProcessTimingEnabled(),
                                                new HashMap<String, String>(),
                                                config.getNumOfRecordsPreview(),
                                                config.getProperties(), false);
            addSpark(new ETLSpark(batchPhaseSpec));
            break;
        default:
            throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.", config.getEngine(), Joiner.on(',').join(Engine.values())));
    }
    addWorkflow(new ETLWorkflow(spec, config.getEngine()));
    schedule(buildSchedule(SCHEDULE_NAME, ProgramType.WORKFLOW, ETLWorkflow.NAME)
               .setDescription("ETL Batch schedule")
               .triggerByTime(config.getSchedule()));
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)
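
The validation step in configure() requires that the generated spec contain exactly one BatchSource stage. Below is a minimal, self-contained sketch of that counting-and-validation pattern; the Stage class, its pluginType field, and requireSingleStageOfType are hypothetical stand-ins for StageSpec and the inline check above, not actual CDAP APIs.

import java.util.Arrays;
import java.util.List;

public class SingleSourceCheck {

    // Hypothetical stand-in for StageSpec: just a stage name and a plugin type.
    static class Stage {
        final String name;
        final String pluginType;
        Stage(String name, String pluginType) {
            this.name = name;
            this.pluginType = pluginType;
        }
    }

    // Counts stages of the given plugin type and fails unless exactly one is present,
    // mirroring the sourceCount check in configure() above.
    static void requireSingleStageOfType(List<Stage> stages, String pluginType) {
        int count = 0;
        for (Stage stage : stages) {
            if (pluginType.equals(stage.pluginType)) {
                count++;
            }
        }
        if (count != 1) {
            throw new IllegalArgumentException(
                "Invalid pipeline. There must only be one " + pluginType + " stage.");
        }
    }

    public static void main(String[] args) {
        List<Stage> stages = Arrays.asList(
            new Stage("read", "batchsource"),
            new Stage("write", "batchsink"));
        requireSingleStageOfType(stages, "batchsource");  // passes: exactly one source stage
    }
}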

Example 27 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class TransformRunner method getSinkWriter.

// needed because output is written to the context differently depending on the number of sink outputs
private OutputWriter<Object, Object> getSinkWriter(MapReduceTaskContext<Object, Object> context, PipelinePhase pipelinePhase, Configuration hConf) {
    Set<StageSpec> reducers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    JobContext hadoopContext = context.getHadoopContext();
    if (!reducers.isEmpty() && hadoopContext instanceof Mapper.Context) {
        return new SingleOutputWriter<>(context);
    }
    String sinkOutputsStr = hConf.get(ETLMapReduce.SINK_OUTPUTS_KEY);
    // should never happen, this is set in initialize
    Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
    Map<String, SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, ETLMapReduce.SINK_OUTPUTS_TYPE);
    return hasSingleOutput(sinkOutputs) ? new SingleOutputWriter<>(context) : new MultiOutputWriter<>(context, sinkOutputs);
}
Also used : Mapper(org.apache.hadoop.mapreduce.Mapper) StageSpec(co.cask.cdap.etl.spec.StageSpec) JobContext(org.apache.hadoop.mapreduce.JobContext)
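
The final lines of getSinkWriter choose between a SingleOutputWriter and a MultiOutputWriter based on the sink-outputs map deserialized from the Hadoop configuration. Below is a minimal sketch of that decision, assuming a plain Map<String, String> in place of CDAP's SinkOutput and a hypothetical chooseWriter helper; the real hasSingleOutput check may consider more than just the map size.

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import java.lang.reflect.Type;
import java.util.Map;

public class SinkWriterChoice {

    private static final Gson GSON = new Gson();
    // Simplified stand-in for ETLMapReduce.SINK_OUTPUTS_TYPE: sink name -> output name.
    private static final Type SINK_OUTPUTS_TYPE = new TypeToken<Map<String, String>>() { }.getType();

    // Hypothetical helper mirroring the hasSingleOutput(...) branch: a single sink can use a
    // single-output writer, otherwise records must be routed to the output of each sink.
    static String chooseWriter(String sinkOutputsJson) {
        Map<String, String> sinkOutputs = GSON.fromJson(sinkOutputsJson, SINK_OUTPUTS_TYPE);
        return sinkOutputs.size() == 1 ? "SingleOutputWriter" : "MultiOutputWriter";
    }

    public static void main(String[] args) {
        System.out.println(chooseWriter("{\"sink1\": \"outputA\"}"));                         // SingleOutputWriter
        System.out.println(chooseWriter("{\"sink1\": \"outputA\", \"sink2\": \"outputB\"}")); // MultiOutputWriter
    }
}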

Aggregations

StageSpec (co.cask.cdap.etl.spec.StageSpec) 27
HashMap (java.util.HashMap) 20
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase) 15
Map (java.util.Map) 10
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime) 8
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec) 7
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator) 7
Connection (co.cask.cdap.etl.proto.Connection) 7
HashSet (java.util.HashSet) 7
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator) 6
PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext) 5
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec) 5
TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy) 4
PluginContext (co.cask.cdap.api.plugin.PluginContext) 4
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken) 4
LinkedHashMap (java.util.LinkedHashMap) 4
Test (org.junit.Test) 4
DatasetContext (co.cask.cdap.api.data.DatasetContext) 2
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 2
SparkClientContext (co.cask.cdap.api.spark.SparkClientContext) 2