Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class ETLBatchApplication, method configure().
@Override
public void configure() {
  ETLBatchConfig config = getConfig().convertOldConfig();
  setDescription(DEFAULT_DESCRIPTION);
  BatchPipelineSpec spec =
    new BatchPipelineSpecGenerator<>(getConfigurer(), ImmutableSet.of(BatchSource.PLUGIN_TYPE),
                                     ImmutableSet.of(BatchSink.PLUGIN_TYPE),
                                     config.getEngine()).generateSpec(config);
  // The batch ETL application supports exactly one source stage.
  int sourceCount = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      sourceCount++;
    }
  }
  if (sourceCount != 1) {
    throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
  }
  PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(),
                                                ImmutableSet.<String>of(), ImmutableSet.<String>of(),
                                                ImmutableSet.<String>of());
  PipelinePlan plan = planner.plan(spec);
  if (plan.getPhases().size() != 1) {
    // should never happen if there is only one source
    throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
  }
  // A single-source pipeline plans into a single phase, which is run by either MapReduce or Spark.
  PipelinePhase pipeline = plan.getPhases().values().iterator().next();
  switch (config.getEngine()) {
    case MAPREDUCE:
      BatchPhaseSpec batchPhaseSpec =
        new BatchPhaseSpec(ETLMapReduce.NAME, pipeline, config.getResources(), config.getDriverResources(),
                           config.getClientResources(), config.isStageLoggingEnabled(),
                           config.isProcessTimingEnabled(), new HashMap<String, String>(),
                           config.getNumOfRecordsPreview(), config.getProperties(), false);
      addMapReduce(new ETLMapReduce(batchPhaseSpec));
      break;
    case SPARK:
      batchPhaseSpec =
        new BatchPhaseSpec(ETLSpark.class.getSimpleName(), pipeline, config.getResources(),
                           config.getDriverResources(), config.getClientResources(),
                           config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
                           new HashMap<String, String>(), config.getNumOfRecordsPreview(),
                           config.getProperties(), false);
      addSpark(new ETLSpark(batchPhaseSpec));
      break;
    default:
      throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.",
                                                        config.getEngine(), Joiner.on(',').join(Engine.values())));
  }
  addWorkflow(new ETLWorkflow(spec, config.getEngine()));
  schedule(buildSchedule(SCHEDULE_NAME, ProgramType.WORKFLOW, ETLWorkflow.NAME)
             .setDescription("ETL Batch schedule")
             .triggerByTime(config.getSchedule()));
}
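The loop over spec.getStages() above is the typical way StageSpec is consumed: each stage exposes its plugin, and the plugin exposes its type. A small, hypothetical helper that generalizes the source-counting loop (it assumes only the accessors already used above, BatchPipelineSpec.getStages() and StageSpec.getPlugin().getType()) could look like this:

// Hypothetical helper, not part of the CDAP code above: count the stages whose plugin has the given type.
private static int countStagesOfType(BatchPipelineSpec spec, String pluginType) {
  int count = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (pluginType.equals(stageSpec.getPlugin().getType())) {
      count++;
    }
  }
  return count;
}

With such a helper, the source check in configure() would reduce to testing countStagesOfType(spec, BatchSource.PLUGIN_TYPE) != 1.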
Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class TransformRunner, method getSinkWriter().
// This is needed because we need to write to the context differently depending on the number of outputs.
private OutputWriter<Object, Object> getSinkWriter(MapReduceTaskContext<Object, Object> context,
                                                   PipelinePhase pipelinePhase, Configuration hConf) {
  Set<StageSpec> reducers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  JobContext hadoopContext = context.getHadoopContext();
  // In the map phase of a pipeline with an aggregator or joiner, the mapper writes a single intermediate output.
  if (!reducers.isEmpty() && hadoopContext instanceof Mapper.Context) {
    return new SingleOutputWriter<>(context);
  }
  String sinkOutputsStr = hConf.get(ETLMapReduce.SINK_OUTPUTS_KEY);
  // should never happen, this is set in initialize
  Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
  Map<String, SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, ETLMapReduce.SINK_OUTPUTS_TYPE);
  return hasSingleOutput(sinkOutputs) ? new SingleOutputWriter<>(context) : new MultiOutputWriter<>(context, sinkOutputs);
}
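The hasSingleOutput helper referenced on the last line is not included in this snippet. A plausible sketch, offered only as an assumption about its intent (choose the simpler SingleOutputWriter when the deserialized map describes exactly one sink), is:

// Hypothetical sketch of the helper referenced above; the project's actual implementation is not shown here.
private boolean hasSingleOutput(Map<String, SinkOutput> sinkOutputs) {
  // Assumption: a single configured sink means a single named output, so no multi-output routing is needed.
  return sinkOutputs.size() == 1;
}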