Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class SmartWorkflow, method addProgram:
private void addProgram(String phaseName, WorkflowProgramAdder programAdder) {
  PipelinePhase phase = plan.getPhase(phaseName);
  // a null phase was artificially added by the control dag flattening process, so there is nothing to add; skip it
  if (phase == null) {
    return;
  }
  // can't use the phase name as a program name because it might contain invalid characters
  String programName = "phase-" + phaseNum;
  phaseNum++;
  // if this phase uses connectors, add the local dataset for each connector if we haven't already
  for (StageInfo connectorInfo : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
    String connectorName = connectorInfo.getName();
    String datasetName = connectorDatasets.get(connectorName);
    if (datasetName == null) {
      datasetName = "conn-" + connectorNum++;
      connectorDatasets.put(connectorName, datasetName);
      // add the local dataset
      ConnectorSource connectorSource = new ConnectorSource(datasetName, null);
      connectorSource.configure(getConfigurer());
    }
  }
  Map<String, String> phaseConnectorDatasets = new HashMap<>();
  for (StageInfo connectorStage : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
    phaseConnectorDatasets.put(connectorStage.getName(), connectorDatasets.get(connectorStage.getName()));
  }
  BatchPhaseSpec batchPhaseSpec =
    new BatchPhaseSpec(programName, phase, spec.getResources(), spec.getDriverResources(),
                       spec.getClientResources(), spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled(),
                       phaseConnectorDatasets, spec.getNumOfRecordsPreview(), spec.getProperties());
  Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
  if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
    // actions are always by themselves in a phase
    programAdder.addAction(new PipelineAction(batchPhaseSpec));
  } else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
    // spark programs are always by themselves in a phase
    String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
    StageSpec stageSpec = stageSpecs.get(stageName);
    applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
    programAdder.addSpark(programName);
  } else if (useSpark) {
    applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
    programAdder.addSpark(programName);
  } else {
    applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec));
    programAdder.addMapReduce(programName);
  }
}
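The connector bookkeeping in this method is a simple memoization: each connector stage gets exactly one generated local dataset name, which is then reused wherever that connector appears. A minimal standalone sketch of that idea with plain Java collections (the class and method names here are made up for illustration and are not CDAP API):

import java.util.HashMap;
import java.util.Map;

// Sketch of the connector-to-dataset bookkeeping shown above.
// The "conn-<n>" naming mirrors the snippet; everything else is illustrative.
public class ConnectorDatasets {
  private final Map<String, String> connectorDatasets = new HashMap<>();
  private int connectorNum = 0;

  // returns the dataset name for a connector, generating it on first use and reusing it afterwards
  public String datasetFor(String connectorName) {
    return connectorDatasets.computeIfAbsent(connectorName, name -> "conn-" + connectorNum++);
  }

  public static void main(String[] args) {
    ConnectorDatasets datasets = new ConnectorDatasets();
    System.out.println(datasets.datasetFor("phase1.out.connector")); // conn-0
    System.out.println(datasets.datasetFor("phase1.out.connector")); // conn-0 again, reused
    System.out.println(datasets.datasetFor("phase2.out.connector")); // conn-1
  }
}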
Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class PipelinePlanner, method dagToPipeline:
/**
* Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
* The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
*
* @param pipelineSpec the overall pipeline spec
* @param dag the dag to convert
* @param connectors connector nodes across all dags
* @param specs specifications for every stage
* @return the dag converted into a PipelinePhase
*/
private PipelinePhase dagToPipeline(PipelineSpec pipelineSpec, Dag dag, Set<String> connectors,
                                    Map<String, StageSpec> specs) {
  PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
  for (String stageName : dag.getTopologicalOrder()) {
    Set<String> outputs = dag.getNodeOutputs(stageName);
    if (!outputs.isEmpty()) {
      phaseBuilder.addConnections(stageName, outputs);
    }
    // add connectors
    if (connectors.contains(stageName)) {
      phaseBuilder.addStage(StageInfo.builder(stageName, Constants.CONNECTOR_TYPE).build());
      continue;
    }
    // add other plugin types
    StageSpec spec = specs.get(stageName);
    String pluginType = spec.getPlugin().getType();
    phaseBuilder.addStage(StageInfo.builder(stageName, pluginType)
                            .addInputs(spec.getInputs())
                            .addInputSchemas(spec.getInputSchemas())
                            .addOutputs(spec.getOutputs())
                            .setOutputSchema(spec.getOutputSchema())
                            .setErrorSchema(spec.getErrorSchema())
                            .setErrorDatasetName(spec.getErrorDatasetName())
                            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
                            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
                            .build());
  }
  return phaseBuilder.build();
}
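Based only on the builder calls visible in this method, a tiny two-stage phase might be assembled along the following lines. This is a rough sketch: the stage names and the supported plugin type set are made up, and only classes already shown in the snippets above are used.

// Sketch: building a minimal PipelinePhase with the builder calls used above.
Set<String> supportedTypes = ImmutableSet.of(BatchSource.PLUGIN_TYPE, BatchSink.PLUGIN_TYPE);
PipelinePhase.Builder builder = PipelinePhase.builder(supportedTypes);
// one connection from a hypothetical "source" stage to a hypothetical "sink" stage
builder.addConnections("source", ImmutableSet.of("sink"));
builder.addStage(StageInfo.builder("source", BatchSource.PLUGIN_TYPE).build());
builder.addStage(StageInfo.builder("sink", BatchSink.PLUGIN_TYPE).build());
PipelinePhase phase = builder.build();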
Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class ETLBatchApplication, method configure:
@Override
public void configure() {
  ETLBatchConfig config = getConfig().convertOldConfig();
  setDescription(DEFAULT_DESCRIPTION);
  PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator =
    new BatchPipelineSpecGenerator(getConfigurer(),
                                   ImmutableSet.of(BatchSource.PLUGIN_TYPE),
                                   ImmutableSet.of(BatchSink.PLUGIN_TYPE),
                                   TimePartitionedFileSet.class,
                                   FileSetProperties.builder()
                                     .setInputFormat(AvroKeyInputFormat.class)
                                     .setOutputFormat(AvroKeyOutputFormat.class)
                                     .setEnableExploreOnCreate(true)
                                     .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
                                     .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
                                     .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
                                     .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
                                     .build(),
                                   config.getEngine());
  BatchPipelineSpec spec = specGenerator.generateSpec(config);
  int sourceCount = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      sourceCount++;
    }
  }
  if (sourceCount != 1) {
    throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
  }
  PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(),
                                                ImmutableSet.<String>of(), ImmutableSet.<String>of());
  PipelinePlan plan = planner.plan(spec);
  if (plan.getPhases().size() != 1) {
    // should never happen if there is only one source
    throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
  }
  PipelinePhase pipeline = plan.getPhases().values().iterator().next();
  switch (config.getEngine()) {
    case MAPREDUCE:
      BatchPhaseSpec batchPhaseSpec =
        new BatchPhaseSpec(ETLMapReduce.NAME, pipeline, config.getResources(), config.getDriverResources(),
                           config.getClientResources(), config.isStageLoggingEnabled(),
                           config.isProcessTimingEnabled(), new HashMap<String, String>(),
                           config.getNumOfRecordsPreview(), config.getProperties());
      addMapReduce(new ETLMapReduce(batchPhaseSpec));
      break;
    case SPARK:
      batchPhaseSpec =
        new BatchPhaseSpec(ETLSpark.class.getSimpleName(), pipeline, config.getResources(),
                           config.getDriverResources(), config.getClientResources(),
                           config.isStageLoggingEnabled(), config.isProcessTimingEnabled(),
                           new HashMap<String, String>(), config.getNumOfRecordsPreview(),
                           config.getProperties());
      addSpark(new ETLSpark(batchPhaseSpec));
      break;
    default:
      throw new IllegalArgumentException(String.format("Invalid execution engine '%s'. Must be one of %s.",
                                                       config.getEngine(), Joiner.on(',').join(Engine.values())));
  }
  addWorkflow(new ETLWorkflow(spec, config.getEngine()));
  scheduleWorkflow(Schedules.builder(SCHEDULE_NAME)
                     .setDescription("ETL Batch schedule")
                     .createTimeSchedule(config.getSchedule()),
                   ETLWorkflow.NAME);
}
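The one-source check above could also be factored into a small helper; a sketch using only the accessors already shown (the helper name is hypothetical):

// Sketch: the single-source validation from configure(), pulled out as a helper.
private static void validateSingleSource(Iterable<StageSpec> stages) {
  int sourceCount = 0;
  for (StageSpec stageSpec : stages) {
    if (BatchSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      sourceCount++;
    }
  }
  if (sourceCount != 1) {
    throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
  }
}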
Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class SparkStreamingPipelineDriver, method run:
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  final DataStreamsPipelineSpec pipelineSpec =
    GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
  PipelinePhase.Builder phaseBuilder =
    PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES).addConnections(pipelineSpec.getConnections());
  for (StageSpec stageSpec : pipelineSpec.getStages()) {
    phaseBuilder.addStage(StageInfo.builder(stageSpec.getName(), stageSpec.getPlugin().getType())
                            .addInputs(stageSpec.getInputs())
                            .addOutputs(stageSpec.getOutputs())
                            .addInputSchemas(stageSpec.getInputSchemas())
                            .setOutputSchema(stageSpec.getOutputSchema())
                            .setErrorSchema(stageSpec.getErrorSchema())
                            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
                            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
                            .build());
  }
  final PipelinePhase pipelinePhase = phaseBuilder.build();
  boolean checkpointsDisabled = pipelineSpec.isCheckpointsDisabled();
  String checkpointDir = null;
  if (!checkpointsDisabled) {
    // Get the location of the checkpoint directory.
    String pipelineName = sec.getApplicationSpecification().getName();
    String relativeCheckpointDir = pipelineSpec.getCheckpointDirectory();
    // there isn't any way to instantiate the fileset except in a TxRunnable, so need to use a reference.
    final AtomicReference<Location> checkpointBaseRef = new AtomicReference<>();
    Transactionals.execute(sec, new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        checkpointBaseRef.set(checkpointFileSet.getBaseLocation());
      }
    }, Exception.class);
    Location pipelineCheckpointDir = checkpointBaseRef.get().append(pipelineName).append(relativeCheckpointDir);
    checkpointDir = pipelineCheckpointDir.toURI().toString();
  }
  JavaStreamingContext jssc = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
  jssc.start();
  boolean stopped = false;
  try {
    // most programs will just keep running forever.
    // however, when CDAP stops the program, we get an interrupted exception.
    // at that point, we need to call stop on jssc, otherwise the program will hang and never stop.
    stopped = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
  } finally {
    if (!stopped) {
      jssc.stop(true, pipelineSpec.isStopGracefully());
    }
  }
}
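The start/await/stop sequence at the end is the standard Spark Streaming shutdown pattern. A stripped-down sketch of the same pattern outside CDAP (the helper name is made up; it assumes the DStream graph has already been wired on the context):

// Standalone sketch of the lifecycle pattern above (not CDAP-specific).
static void runUntilStopped(JavaStreamingContext jssc, boolean stopGracefully) throws Exception {
  jssc.start();
  boolean stopped = false;
  try {
    // returns true once the context has stopped on its own;
    // an interrupt propagates out and leaves stopped == false, so the finally block stops the context
    stopped = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
  } finally {
    if (!stopped) {
      // first flag also stops the underlying SparkContext; second flag controls graceful shutdown
      jssc.stop(true, stopGracefully);
    }
  }
}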
Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
The class DataStreamsSparkLauncher, method initialize:
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
  DataStreamsPipelineSpec spec =
    GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
  int numSources = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
      numSources = numSources + streamingSource.getRequiredExecutors();
    }
  }
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
    sparkConf.set(property.getKey(), property.getValue());
  }
  String extraOpts = spec.getExtraJavaOpts();
  if (extraOpts != null && !extraOpts.isEmpty()) {
    sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
    sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
  }
  // without this, stopping will hang on machines with few cores.
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
  // spark makes you set the local master thread count to at least the number of receivers (streaming sources),
  // because it holds one thread per receiver, or one core in distributed mode.
  // so we set this hacky master variable based on the isUnitTest setting in the config.
  sparkConf.setMaster(String.format("local[%d]", numSources + 2));
  if (spec.isUnitTest()) {
    sparkConf.setMaster(String.format("local[%d]", numSources + 1));
  }
  context.setSparkConf(sparkConf);
  if (!spec.isCheckpointsDisabled()) {
    // Each pipeline has its own checkpoint directory within the checkpoint fileset.
    // Ideally, when a pipeline is deleted, we would delete that checkpoint directory,
    // because we don't want another pipeline created with the same name to pick up the old checkpoint.
    // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
    // and use it as a checkpoint subdirectory inside the pipeline name directory.
    // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
    FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
    String pipelineName = context.getApplicationSpecification().getName();
    String checkpointDir = spec.getCheckpointDirectory();
    Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
    Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
    if (!ensureDirExists(pipelineCheckpointBase)) {
      throw new IOException(
        String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
    }
    try {
      for (Location child : pipelineCheckpointBase.list()) {
        if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
          LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
        }
      }
    } catch (Exception e) {
      LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
    }
    if (!ensureDirExists(pipelineCheckpointDir)) {
      throw new IOException(
        String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
    }
  }
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
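The sizing above follows the usual Spark Streaming rule for local mode: at least one thread per receiver, plus headroom for processing. A hedged sketch of that sizing in isolation, reusing only the properties set in the method above (the helper name is made up, and the "+ 2" / "+ 1" headroom simply mirrors the snippet):

import org.apache.spark.SparkConf;

// Sketch of the receiver-aware sizing used above: one core per receiver plus headroom for processing.
static SparkConf confForReceivers(int numReceivers, boolean unitTest) {
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  // stopping can hang with too few RPC dispatcher threads, so scale with the receiver count
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numReceivers + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numReceivers + 2));
  // in local mode, each receiver pins a thread, so leave at least one extra for processing
  sparkConf.setMaster(String.format("local[%d]", unitTest ? numReceivers + 1 : numReceivers + 2));
  return sparkConf;
}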