use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class SmartWorkflow method initialize.
@Override
public void initialize(WorkflowContext context) throws Exception {
  super.initialize(context);
  TriggeringScheduleInfo scheduleInfo = context.getTriggeringScheduleInfo();
  if (scheduleInfo != null) {
    String propertiesMappingString = scheduleInfo.getProperties().get(TRIGGERING_PROPERTIES_MAPPING);
    if (propertiesMappingString != null) {
      TriggeringPropertyMapping propertiesMapping =
        GSON.fromJson(propertiesMappingString, TriggeringPropertyMapping.class);
      updateTokenWithTriggeringProperties(scheduleInfo, propertiesMapping, context.getToken());
    }
  }
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, workflowMetrics);
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(),
                     pipelineRuntime.getArguments().asMap());
  alertPublishers = new HashMap<>();
  postActions = new LinkedHashMap<>();
  spec = GSON.fromJson(context.getWorkflowSpecification().getProperty(Constants.PIPELINE_SPEC_KEY),
                       BatchPipelineSpec.class);
  stageSpecs = new HashMap<>();
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                            context.getLogicalStartTime(),
                                                            context, context.getNamespace());
  PluginContext pluginContext = new PipelinePluginContext(context, workflowMetrics,
                                                          spec.isStageLoggingEnabled(),
                                                          spec.isProcessTimingEnabled());
  for (ActionSpec actionSpec : spec.getEndingActions()) {
    String stageName = actionSpec.getName();
    postActions.put(stageName, (PostAction) pluginContext.newPluginInstance(stageName, macroEvaluator));
    stageSpecs.put(stageName, StageSpec.builder(stageName, actionSpec.getPluginSpec())
      .setStageLoggingEnabled(spec.isStageLoggingEnabled())
      .setProcessTimingEnabled(spec.isProcessTimingEnabled())
      .build());
  }
  for (StageSpec stageSpec : spec.getStages()) {
    String stageName = stageSpec.getName();
    stageSpecs.put(stageName, stageSpec);
    if (AlertPublisher.PLUGIN_TYPE.equals(stageSpec.getPluginType())) {
      AlertPublisher alertPublisher = context.newPluginInstance(stageName, macroEvaluator);
      alertPublishers.put(stageName, alertPublisher);
    }
  }
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
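The StageSpec builder chain used for the ending actions above can be pulled into a small helper. A minimal sketch using only the ActionSpec, BatchPipelineSpec, and StageSpec.builder calls that appear in the snippet; the helper itself is hypothetical and not part of SmartWorkflow:

// Hedged sketch, not project code: build a StageSpec for a post-action stage the same way
// initialize() does above. The PluginSpec comes from the ActionSpec, and the logging/timing
// flags are copied from the pipeline spec.
private static StageSpec buildPostActionSpec(ActionSpec actionSpec, BatchPipelineSpec spec) {
  return StageSpec.builder(actionSpec.getName(), actionSpec.getPluginSpec())
    .setStageLoggingEnabled(spec.isStageLoggingEnabled())
    .setProcessTimingEnabled(spec.isProcessTimingEnabled())
    .build();
}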
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class SmartWorkflow method getPhaseSpec.
private BatchPhaseSpec getPhaseSpec(String programName, PipelinePhase phase) {
  // if this phase uses connectors, add the local dataset for that connector if we haven't already
  for (StageSpec connectorInfo : phase.getStagesOfType(Constants.Connector.PLUGIN_TYPE)) {
    String connectorName = connectorInfo.getName();
    String datasetName = connectorDatasets.get(connectorName);
    if (datasetName == null) {
      datasetName = "conn-" + connectorNum++;
      connectorDatasets.put(connectorName, datasetName);
      // add the local dataset
      ConnectorSource connectorSource = new MultiConnectorSource(datasetName, null);
      connectorSource.configure(getConfigurer());
    }
  }
  // if this phase uses alert publishers, configure the local dataset where their alerts will be published.
  for (StageSpec alertPublisherInfo : phase.getStagesOfType(AlertPublisher.PLUGIN_TYPE)) {
    String stageName = alertPublisherInfo.getName();
    AlertPublisherSink alertPublisherSink = new AlertPublisherSink(stageName, null);
    alertPublisherSink.configure(getConfigurer());
  }
  Map<String, String> phaseConnectorDatasets = new HashMap<>();
  for (StageSpec connectorStage : phase.getStagesOfType(Constants.Connector.PLUGIN_TYPE)) {
    phaseConnectorDatasets.put(connectorStage.getName(), connectorDatasets.get(connectorStage.getName()));
  }
  return new BatchPhaseSpec(programName, phase, spec.getResources(), spec.getDriverResources(),
                            spec.getClientResources(), spec.isStageLoggingEnabled(),
                            spec.isProcessTimingEnabled(), phaseConnectorDatasets,
                            spec.getNumOfRecordsPreview(), spec.getProperties(),
                            !plan.getConditionPhaseBranches().isEmpty());
}
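The pattern of walking phase.getStagesOfType(...) and keying data by StageSpec.getName() is easy to factor out. A minimal sketch of such a helper, using only the PipelinePhase and StageSpec methods seen above; the helper is an illustration, not part of the project:

// Hedged sketch: collect the names of all stages of one plugin type in a phase,
// mirroring the loops in getPhaseSpec(). Requires java.util.Set and java.util.HashSet.
private static Set<String> stageNamesOfType(PipelinePhase phase, String pluginType) {
  Set<String> names = new HashSet<>();
  for (StageSpec stageSpec : phase.getStagesOfType(pluginType)) {
    names.add(stageSpec.getName());
  }
  return names;
}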
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class SmartWorkflow method destroy.
@Override
public void destroy() {
  WorkflowContext workflowContext = getContext();
  PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
  // Execute the post actions only if pipeline is not running in preview mode.
  if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
    for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
      String name = endingActionEntry.getKey();
      PostAction action = endingActionEntry.getValue();
      StageSpec stageSpec = stageSpecs.get(name);
      BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
      try {
        action.run(context);
      } catch (Throwable t) {
        LOG.error("Error while running post action {}.", name, t);
      }
    }
  }
  // publish all alerts
  for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
    String name = alertPublisherEntry.getKey();
    AlertPublisher alertPublisher = alertPublisherEntry.getValue();
    PartitionedFileSet alertConnector = workflowContext.getDataset(name);
    try (CloseableIterator<Alert> alerts =
           new AlertReader(alertConnector.getPartitions(PartitionFilter.ALWAYS_MATCH))) {
      if (!alerts.hasNext()) {
        continue;
      }
      StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, name);
      StageSpec stageSpec = stageSpecs.get(name);
      AlertPublisherContext alertContext =
        new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
      alertPublisher.initialize(alertContext);
      TrackedIterator<Alert> trackedIterator =
        new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
      alertPublisher.publish(trackedIterator);
    } catch (Exception e) {
      LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", name, e);
    } finally {
      try {
        alertPublisher.destroy();
      } catch (Exception e) {
        LOG.warn("Error destroying alert publisher for stage {}", name, e);
      }
    }
  }
  ProgramStatus status = getContext().getState().getStatus();
  if (status == ProgramStatus.FAILED) {
    WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
  } else {
    WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(),
                       status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
  }
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                            workflowContext.getLogicalStartTime(),
                                                            workflowContext, workflowContext.getNamespace());
  // Get resolved plugin properties
  Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
  for (StageSpec spec : stageSpecs.values()) {
    String stageName = spec.getName();
    resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
  }
  // Add resolved plugin properties to workflow token as a JSON String
  workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
}
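A consumer of the workflow token can turn the JSON written under RESOLVED_PLUGIN_PROPERTIES_MAP back into a map with Gson. A minimal sketch, assuming resolvedPropertiesJson holds the token value already read back and that the stage name used for the lookup is illustrative:

// Hedged sketch: deserialize the resolved-properties JSON written by destroy() above.
// Requires java.lang.reflect.Type and com.google.gson.reflect.TypeToken; GSON is the same
// Gson instance used by the class.
Type mapType = new TypeToken<Map<String, Map<String, String>>>() { }.getType();
Map<String, Map<String, String>> resolved = GSON.fromJson(resolvedPropertiesJson, mapType);
Map<String, String> propertiesForStage = resolved.get("someStageName"); // stage name is illustrative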
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class SmartWorkflow method addProgram.
private WorkflowProgramAdder addProgram(String phaseName, WorkflowProgramAdder programAdder) {
  PipelinePhase phase = plan.getPhase(phaseName);
  // a null phase means this node was artificially added by the control dag flattening process,
  // so there is nothing to add; skip it
  if (phase == null) {
    return programAdder;
  }
  // can't use phase name as a program name because it might contain invalid characters
  String programName = "phase-" + phaseNum;
  phaseNum++;
  BatchPhaseSpec batchPhaseSpec = getPhaseSpec(programName, phase);
  Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
  if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
    // actions will be all by themselves in a phase
    programAdder.addAction(new PipelineAction(batchPhaseSpec));
  } else if (pluginTypes.contains(Condition.PLUGIN_TYPE)) {
    // conditions will be all by themselves in a phase
    // addCondition(programAdder, phaseName, batchPhaseSpec);
    programAdder = programAdder.condition(new PipelineCondition(batchPhaseSpec));
  } else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
    // spark programs will be all by themselves in a phase
    String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
    StageSpec stageSpec = stageSpecs.get(stageName);
    applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
    programAdder.addSpark(programName);
  } else if (useSpark) {
    applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
    programAdder.addSpark(programName);
  } else {
    applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec, new HashSet<>(connectorDatasets.values())));
    programAdder.addMapReduce(programName);
  }
  return programAdder;
}
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class DataStreamsSparkLauncher method initialize.
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
  DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID),
                                               DataStreamsPipelineSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
  int numSources = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
      numSources = numSources + streamingSource.getRequiredExecutors();
    }
  }
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
    sparkConf.set(property.getKey(), property.getValue());
  }
  // spark... makes you set this to at least the number of receivers (streaming sources)
  // because it holds one thread per receiver, or one core in distributed mode.
  // so... we have to set this hacky master variable based on the isUnitTest setting in the config
  String extraOpts = spec.getExtraJavaOpts();
  if (extraOpts != null && !extraOpts.isEmpty()) {
    sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
    sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
  }
  // without this, stopping will hang on machines with few cores.
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
  sparkConf.setMaster(String.format("local[%d]", numSources + 2));
  if (spec.isUnitTest()) {
    sparkConf.setMaster(String.format("local[%d]", numSources + 1));
  }
  context.setSparkConf(sparkConf);
  if (!spec.isCheckpointsDisabled()) {
    // Each pipeline has its own checkpoint directory within the checkpoint fileset.
    // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory.
    // This is because we don't want another pipeline created with the same name to pick up the old checkpoint.
    // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
    // and use it as the name of the checkpoint directory, nested inside a directory named after the pipeline.
    // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
    FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
    String pipelineName = context.getApplicationSpecification().getName();
    String checkpointDir = spec.getCheckpointDirectory();
    Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
    Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
    if (!ensureDirExists(pipelineCheckpointBase)) {
      throw new IOException(
        String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
    }
    try {
      for (Location child : pipelineCheckpointBase.list()) {
        if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
          LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
        }
      }
    } catch (Exception e) {
      LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
    }
    if (!ensureDirExists(pipelineCheckpointDir)) {
      throw new IOException(
        String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
    }
  }
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
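The snippet calls a private helper ensureDirExists(Location) that is not shown on this page. A minimal sketch of what such a helper could look like, inferred only from how it is called above and from the org.apache.twill.filesystem.Location API; this is an assumption, not necessarily the project's actual implementation:

// Hedged sketch of the unshown helper: make sure the location exists as a directory.
// The trailing isDirectory() check tolerates a concurrent mkdirs() by another process.
private boolean ensureDirExists(Location location) throws IOException {
  return location.isDirectory() || location.mkdirs() || location.isDirectory();
}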