
Example 21 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

The class SmartWorkflow, method initialize.

@Override
public void initialize(WorkflowContext context) throws Exception {
    super.initialize(context);
    TriggeringScheduleInfo scheduleInfo = context.getTriggeringScheduleInfo();
    if (scheduleInfo != null) {
        String propertiesMappingString = scheduleInfo.getProperties().get(TRIGGERING_PROPERTIES_MAPPING);
        if (propertiesMappingString != null) {
            TriggeringPropertyMapping propertiesMapping = GSON.fromJson(propertiesMappingString, TriggeringPropertyMapping.class);
            updateTokenWithTriggeringProperties(scheduleInfo, propertiesMapping, context.getToken());
        }
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, workflowMetrics);
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), pipelineRuntime.getArguments().asMap());
    alertPublishers = new HashMap<>();
    postActions = new LinkedHashMap<>();
    spec = GSON.fromJson(context.getWorkflowSpecification().getProperty(Constants.PIPELINE_SPEC_KEY), BatchPipelineSpec.class);
    stageSpecs = new HashMap<>();
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    PluginContext pluginContext = new PipelinePluginContext(context, workflowMetrics, spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled());
    for (ActionSpec actionSpec : spec.getEndingActions()) {
        String stageName = actionSpec.getName();
        postActions.put(stageName, (PostAction) pluginContext.newPluginInstance(stageName, macroEvaluator));
        stageSpecs.put(stageName, StageSpec.builder(stageName, actionSpec.getPluginSpec()).setStageLoggingEnabled(spec.isStageLoggingEnabled()).setProcessTimingEnabled(spec.isProcessTimingEnabled()).build());
    }
    for (StageSpec stageSpec : spec.getStages()) {
        String stageName = stageSpec.getName();
        stageSpecs.put(stageName, stageSpec);
        if (AlertPublisher.PLUGIN_TYPE.equals(stageSpec.getPluginType())) {
            AlertPublisher alertPublisher = context.newPluginInstance(stageName, macroEvaluator);
            alertPublishers.put(stageName, alertPublisher);
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used: PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) ActionSpec(co.cask.cdap.etl.batch.ActionSpec) AlertPublisher(co.cask.cdap.etl.api.AlertPublisher) PluginContext(co.cask.cdap.api.plugin.PluginContext) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) TriggeringScheduleInfo(co.cask.cdap.api.schedule.TriggeringScheduleInfo) BatchPipelineSpec(co.cask.cdap.etl.batch.BatchPipelineSpec) TriggeringPropertyMapping(co.cask.cdap.etl.proto.v2.TriggeringPropertyMapping) StageSpec(co.cask.cdap.etl.spec.StageSpec)
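
The spec that initialize reads comes from a workflow property expected to hold the Gson-serialized BatchPipelineSpec, so the method is the run-time half of a configure-time/run-time round-trip. Below is a minimal, self-contained sketch of that round-trip; the POJO, field names, and property key are hypothetical stand-ins for BatchPipelineSpec and Constants.PIPELINE_SPEC_KEY, not CDAP types.

// Illustrative only: serialize a spec into a property map, then read it back with Gson.
import com.google.gson.Gson;
import java.util.HashMap;
import java.util.Map;

public class SpecRoundTrip {

    // Hypothetical stand-in for BatchPipelineSpec; the field names are made up.
    static class DemoSpec {
        boolean stageLoggingEnabled = true;
        boolean processTimingEnabled = false;
    }

    public static void main(String[] args) {
        Gson gson = new Gson();
        // configure time: serialize the spec into a workflow property
        Map<String, String> workflowProperties = new HashMap<>();
        workflowProperties.put("pipeline.spec", gson.toJson(new DemoSpec()));
        // initialize time: read the property back into the spec object
        DemoSpec restored = gson.fromJson(workflowProperties.get("pipeline.spec"), DemoSpec.class);
        System.out.println("stage logging enabled: " + restored.stageLoggingEnabled);
    }
}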

Example 22 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

The class SmartWorkflow, method getPhaseSpec.

private BatchPhaseSpec getPhaseSpec(String programName, PipelinePhase phase) {
    // if this phase uses connectors, add the local dataset for that connector if we haven't already
    for (StageSpec connectorInfo : phase.getStagesOfType(Constants.Connector.PLUGIN_TYPE)) {
        String connectorName = connectorInfo.getName();
        String datasetName = connectorDatasets.get(connectorName);
        if (datasetName == null) {
            datasetName = "conn-" + connectorNum++;
            connectorDatasets.put(connectorName, datasetName);
            // add the local dataset
            ConnectorSource connectorSource = new MultiConnectorSource(datasetName, null);
            connectorSource.configure(getConfigurer());
        }
    }
    // add a local dataset for each alert publisher in this phase; alerts are written there and later published
    for (StageSpec alertPublisherInfo : phase.getStagesOfType(AlertPublisher.PLUGIN_TYPE)) {
        String stageName = alertPublisherInfo.getName();
        AlertPublisherSink alertPublisherSink = new AlertPublisherSink(stageName, null);
        alertPublisherSink.configure(getConfigurer());
    }
    Map<String, String> phaseConnectorDatasets = new HashMap<>();
    for (StageSpec connectorStage : phase.getStagesOfType(Constants.Connector.PLUGIN_TYPE)) {
        phaseConnectorDatasets.put(connectorStage.getName(), connectorDatasets.get(connectorStage.getName()));
    }
    return new BatchPhaseSpec(programName, phase, spec.getResources(), spec.getDriverResources(), spec.getClientResources(), spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled(), phaseConnectorDatasets, spec.getNumOfRecordsPreview(), spec.getProperties(), !plan.getConditionPhaseBranches().isEmpty());
}
Also used: MultiConnectorSource(co.cask.cdap.etl.batch.connector.MultiConnectorSource) ConnectorSource(co.cask.cdap.etl.batch.connector.ConnectorSource) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) StageSpec(co.cask.cdap.etl.spec.StageSpec) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) AlertPublisherSink(co.cask.cdap.etl.batch.connector.AlertPublisherSink)
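
The connector loop in getPhaseSpec assigns each connector stage a local dataset name of the form "conn-<n>" exactly once and caches it in connectorDatasets. A minimal sketch of that naming scheme in isolation follows; the class and stage names are illustrative, not part of CDAP.

// A sketch of the "conn-<n>" naming scheme: each connector gets a dataset name once, then reuses it.
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class ConnectorDatasetNames {

    private final Map<String, String> connectorDatasets = new HashMap<>();
    private int connectorNum = 0;

    // same lazy assignment as in getPhaseSpec, expressed with computeIfAbsent
    String datasetFor(String connectorName) {
        return connectorDatasets.computeIfAbsent(connectorName, name -> "conn-" + connectorNum++);
    }

    public static void main(String[] args) {
        ConnectorDatasetNames names = new ConnectorDatasetNames();
        // hypothetical connector stage names; the repeated stage reuses its cached dataset name
        for (String stage : Arrays.asList("source.connector", "joiner.connector", "source.connector")) {
            System.out.println(stage + " -> " + names.datasetFor(stage));
        }
        // prints conn-0, conn-1, conn-0
    }
}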

Example 23 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

The class SmartWorkflow, method destroy.

@Override
public void destroy() {
    WorkflowContext workflowContext = getContext();
    PipelineRuntime pipelineRuntime = new PipelineRuntime(workflowContext, workflowMetrics);
    // Execute the post actions only if pipeline is not running in preview mode.
    if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
        for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
            String name = endingActionEntry.getKey();
            PostAction action = endingActionEntry.getValue();
            StageSpec stageSpec = stageSpecs.get(name);
            BatchActionContext context = new WorkflowBackedActionContext(workflowContext, pipelineRuntime, stageSpec);
            try {
                action.run(context);
            } catch (Throwable t) {
                LOG.error("Error while running post action {}.", name, t);
            }
        }
    }
    // publish all alerts
    for (Map.Entry<String, AlertPublisher> alertPublisherEntry : alertPublishers.entrySet()) {
        String name = alertPublisherEntry.getKey();
        AlertPublisher alertPublisher = alertPublisherEntry.getValue();
        PartitionedFileSet alertConnector = workflowContext.getDataset(name);
        try (CloseableIterator<Alert> alerts = new AlertReader(alertConnector.getPartitions(PartitionFilter.ALWAYS_MATCH))) {
            if (!alerts.hasNext()) {
                continue;
            }
            StageMetrics stageMetrics = new DefaultStageMetrics(workflowMetrics, name);
            StageSpec stageSpec = stageSpecs.get(name);
            AlertPublisherContext alertContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, workflowContext, workflowContext.getAdmin());
            alertPublisher.initialize(alertContext);
            TrackedIterator<Alert> trackedIterator = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
            alertPublisher.publish(trackedIterator);
        } catch (Exception e) {
            LOG.warn("Stage {} had errors publishing alerts. Alerts may not have been published.", name, e);
        } finally {
            try {
                alertPublisher.destroy();
            } catch (Exception e) {
                LOG.warn("Error destroying alert publisher for stage {}", name, e);
            }
        }
    }
    ProgramStatus status = getContext().getState().getStatus();
    if (status == ProgramStatus.FAILED) {
        WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
    } else {
        WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(), status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
    }
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), workflowContext.getLogicalStartTime(), workflowContext, workflowContext.getNamespace());
    // Get resolved plugin properties
    Map<String, Map<String, String>> resolvedProperties = new HashMap<>();
    for (StageSpec spec : stageSpecs.values()) {
        String stageName = spec.getName();
        resolvedProperties.put(stageName, workflowContext.getPluginProperties(stageName, macroEvaluator).getProperties());
    }
    // Add resolved plugin properties to workflow token as a JSON String
    workflowContext.getToken().put(RESOLVED_PLUGIN_PROPERTIES_MAP, GSON.toJson(resolvedProperties));
}
Also used: PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) BatchActionContext(co.cask.cdap.etl.api.batch.BatchActionContext) WorkflowBackedActionContext(co.cask.cdap.etl.batch.WorkflowBackedActionContext) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) AlertReader(co.cask.cdap.etl.batch.connector.AlertReader) StageSpec(co.cask.cdap.etl.spec.StageSpec) StageMetrics(co.cask.cdap.etl.api.StageMetrics) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics) DefaultAlertPublisherContext(co.cask.cdap.etl.common.DefaultAlertPublisherContext) AlertPublisherContext(co.cask.cdap.etl.api.AlertPublisherContext) AlertPublisher(co.cask.cdap.etl.api.AlertPublisher) TrackedIterator(co.cask.cdap.etl.common.TrackedIterator) WorkflowContext(co.cask.cdap.api.workflow.WorkflowContext) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) DisjointConnectionsException(co.cask.cdap.etl.planner.DisjointConnectionsException) Alert(co.cask.cdap.etl.api.Alert) PostAction(co.cask.cdap.etl.api.batch.PostAction) Map(java.util.Map) ProgramStatus(co.cask.cdap.api.ProgramStatus)
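
Note how destroy runs each post action inside its own try/catch, so a failing action is logged and the remaining actions (and the alert publishers) still run. A small stand-alone sketch of that error-isolation pattern follows, using a simplified PostAction interface rather than the CDAP one.

// Each post action gets its own try/catch so one failure does not prevent the others.
import java.util.LinkedHashMap;
import java.util.Map;

public class PostActionRunner {

    interface PostAction {
        void run() throws Exception;
    }

    public static void main(String[] args) {
        Map<String, PostAction> postActions = new LinkedHashMap<>();
        postActions.put("emailOnSuccess", () -> System.out.println("sending email"));
        postActions.put("cleanupTemp", () -> { throw new IllegalStateException("dataset missing"); });
        postActions.put("notifyTeam", () -> System.out.println("posting notification"));
        for (Map.Entry<String, PostAction> entry : postActions.entrySet()) {
            try {
                entry.getValue().run();
            } catch (Throwable t) {
                // log and continue, mirroring SmartWorkflow.destroy
                System.err.println("Error while running post action " + entry.getKey() + ": " + t);
            }
        }
    }
}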

Example 24 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

The class SmartWorkflow, method addProgram.

private WorkflowProgramAdder addProgram(String phaseName, WorkflowProgramAdder programAdder) {
    PipelinePhase phase = plan.getPhase(phaseName);
    // a null phase was artificially added by the control dag flattening process, so there is nothing to add; skip it
    if (phase == null) {
        return programAdder;
    }
    // can't use phase name as a program name because it might contain invalid characters
    String programName = "phase-" + phaseNum;
    phaseNum++;
    BatchPhaseSpec batchPhaseSpec = getPhaseSpec(programName, phase);
    Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
    if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
        // actions will be all by themselves in a phase
        programAdder.addAction(new PipelineAction(batchPhaseSpec));
    } else if (pluginTypes.contains(Condition.PLUGIN_TYPE)) {
        // conditions will be all by themselves in a phase
        // addCondition(programAdder, phaseName, batchPhaseSpec);
        programAdder = programAdder.condition(new PipelineCondition(batchPhaseSpec));
    } else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
        // spark programs will be all by themselves in a phase
        String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
        StageSpec stageSpec = stageSpecs.get(stageName);
        applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
        programAdder.addSpark(programName);
    } else if (useSpark) {
        applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
        programAdder.addSpark(programName);
    } else {
        applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec, new HashSet<>(connectorDatasets.values())));
        programAdder.addMapReduce(programName);
    }
    return programAdder;
}
Also used: ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) PipelineCondition(co.cask.cdap.etl.batch.condition.PipelineCondition) PipelineAction(co.cask.cdap.etl.batch.customaction.PipelineAction)
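
The branching in addProgram maps a phase's plugin types onto a program type: custom action, condition, an external Spark program, ETLSpark, or ETLMapReduce. A compact sketch of that dispatch follows; the literal plugin type strings are assumptions standing in for Action.PLUGIN_TYPE, Condition.PLUGIN_TYPE, and Constants.SPARK_PROGRAM_PLUGIN_TYPE, and the enum is purely illustrative.

// A sketch of the phase-to-program dispatch, reporting which program kind a phase would map to.
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class PhaseDispatch {

    enum ProgramKind { CUSTOM_ACTION, CONDITION, EXTERNAL_SPARK, ETL_SPARK, ETL_MAPREDUCE }

    static ProgramKind dispatch(Set<String> pluginTypes, boolean useSpark) {
        if (pluginTypes.contains("action")) {
            // actions are all by themselves in a phase
            return ProgramKind.CUSTOM_ACTION;
        } else if (pluginTypes.contains("condition")) {
            // so are conditions
            return ProgramKind.CONDITION;
        } else if (pluginTypes.contains("sparkprogram")) {
            // a user-supplied Spark program runs as its own phase
            return ProgramKind.EXTERNAL_SPARK;
        } else if (useSpark) {
            return ProgramKind.ETL_SPARK;
        } else {
            return ProgramKind.ETL_MAPREDUCE;
        }
    }

    public static void main(String[] args) {
        System.out.println(dispatch(new HashSet<>(Arrays.asList("batchsource", "batchsink")), true));
    }
}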

Example 25 with StageSpec

Use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

The class DataStreamsSparkLauncher, method initialize.

@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        sparkConf.set(property.getKey(), property.getValue());
    }
    // spark... makes you set this to at least the number of receivers (streaming sources)
    // because it holds one thread per receiver, or one core in distributed mode.
    // so... we have to set this hacky master variable based on the isUnitTest setting in the config
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    context.setSparkConf(sparkConf);
    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory.
        // This is because we don't want another pipeline created with the same name to pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
        // and use that as the checkpoint directory as a subdirectory inside the pipeline name directory.
        // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }
        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used: FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) Location(org.apache.twill.filesystem.Location) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)
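
Since Spark Streaming pins one thread (or one core, in distributed mode) per receiver, the local master size above is derived from the number of streaming sources plus a couple of extra threads for processing. A minimal sketch of just that sizing logic follows, with hypothetical values standing in for the spec.

// Sizing the local[] master from the number of streaming sources; input values are hypothetical.
import org.apache.spark.SparkConf;

public class MasterSizing {

    public static void main(String[] args) {
        int numSources = 2;        // stand-in for the sum of getRequiredExecutors() over streaming sources
        boolean unitTest = false;  // stand-in for spec.isUnitTest()
        SparkConf sparkConf = new SparkConf();
        sparkConf.set("spark.streaming.backpressure.enabled", "true");
        // one thread per receiver plus extra threads for processing / RPC dispatch
        int threads = unitTest ? numSources + 1 : numSources + 2;
        sparkConf.setMaster(String.format("local[%d]", threads));
        System.out.println(sparkConf.get("spark.master"));
    }
}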

Aggregations

StageSpec (co.cask.cdap.etl.spec.StageSpec): 27 usages
HashMap (java.util.HashMap): 20 usages
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 15 usages
Map (java.util.Map): 10 usages
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime): 8 usages
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 7 usages
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 7 usages
Connection (co.cask.cdap.etl.proto.Connection): 7 usages
HashSet (java.util.HashSet): 7 usages
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 6 usages
PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext): 5 usages
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 5 usages
TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy): 4 usages
PluginContext (co.cask.cdap.api.plugin.PluginContext): 4 usages
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 4 usages
LinkedHashMap (java.util.LinkedHashMap): 4 usages
Test (org.junit.Test): 4 usages
DatasetContext (co.cask.cdap.api.data.DatasetContext): 2 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2 usages
SparkClientContext (co.cask.cdap.api.spark.SparkClientContext): 2 usages