Search in sources :

Example 6 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class SmartWorkflow method addProgram.

/**
 * Creates the program that executes the named phase and registers it with the given program adder.
 *
 * <p>Any connector stage in the phase that does not yet have a dataset name in {@code connectorDatasets}
 * is assigned a new "conn-N" name and configured as a {@link ConnectorSource}. The kind of program added
 * (action, external spark program, ETL spark, or ETL mapreduce) depends on the plugin types in the phase
 * and on {@code useSpark}.
 *
 * @param phaseName name of the phase to look up in the plan
 * @param programAdder receives the action or program name for the phase
 */
private void addProgram(String phaseName, WorkflowProgramAdder programAdder) {
    PipelinePhase phase = plan.getPhase(phaseName);
    if (phase == null) {
        // the phase was artificially added by the control dag flattening process, so there is nothing to add
        return;
    }

    // the phase name might contain characters that are invalid in a program name, so number the programs
    String programName = "phase-" + phaseNum++;

    // collect the connector datasets used by this phase, adding a local dataset for every connector
    // that has not been seen before
    Map<String, String> phaseConnectorDatasets = new HashMap<>();
    for (StageInfo connector : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
        String connectorName = connector.getName();
        String datasetName = connectorDatasets.get(connectorName);
        if (datasetName == null) {
            datasetName = "conn-" + connectorNum;
            connectorNum++;
            connectorDatasets.put(connectorName, datasetName);
            // add the local dataset
            new ConnectorSource(datasetName, null).configure(getConfigurer());
        }
        phaseConnectorDatasets.put(connectorName, datasetName);
    }

    BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(
        programName,
        phase,
        spec.getResources(),
        spec.getDriverResources(),
        spec.getClientResources(),
        spec.isStageLoggingEnabled(),
        spec.isProcessTimingEnabled(),
        phaseConnectorDatasets,
        spec.getNumOfRecordsPreview(),
        spec.getProperties());
    Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();

    if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
        // actions will be all by themselves in a phase
        programAdder.addAction(new PipelineAction(batchPhaseSpec));
        return;
    }

    if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
        // spark programs will be all by themselves in a phase, so the first such stage is the one to run
        String sparkStageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
        StageSpec sparkStageSpec = stageSpecs.get(sparkStageName);
        applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, sparkStageSpec));
        programAdder.addSpark(programName);
        return;
    }

    if (useSpark) {
        applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
        programAdder.addSpark(programName);
        return;
    }

    applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec));
    programAdder.addMapReduce(programName);
}
Also used : ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) StageInfo(co.cask.cdap.etl.planner.StageInfo) PipelineAction(co.cask.cdap.etl.batch.customaction.PipelineAction) ConnectorSource(co.cask.cdap.etl.batch.connector.ConnectorSource) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec)

Example 7 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class PipelinePlanner method dagToPipeline.

/**
 * Builds a PipelinePhase out of a Dag by visiting its nodes in topological order.
 * Nodes named in {@code connectors} become stages of the connector type; every other node becomes a
 * stage whose plugin type, inputs, outputs and schemas are copied from its entry in {@code specs}.
 *
 * @param pipelineSpec the overall pipeline spec, used for the stage logging and process timing flags
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags
 * @param specs specifications for every stage, keyed by stage name
 * @return the phase built from the dag
 */
private PipelinePhase dagToPipeline(PipelineSpec pipelineSpec, Dag dag, Set<String> connectors, Map<String, StageSpec> specs) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
    for (String stageName : dag.getTopologicalOrder()) {
        // connections are only recorded for nodes that actually have outputs
        Set<String> stageOutputs = dag.getNodeOutputs(stageName);
        if (!stageOutputs.isEmpty()) {
            phaseBuilder.addConnections(stageName, stageOutputs);
        }

        if (connectors.contains(stageName)) {
            // connectors carry nothing but their name and the connector type
            phaseBuilder.addStage(StageInfo.builder(stageName, Constants.CONNECTOR_TYPE).build());
        } else {
            // every other plugin type is described by its stage spec
            StageSpec stageSpec = specs.get(stageName);
            phaseBuilder.addStage(
                StageInfo.builder(stageName, stageSpec.getPlugin().getType())
                    .addInputs(stageSpec.getInputs())
                    .addInputSchemas(stageSpec.getInputSchemas())
                    .addOutputs(stageSpec.getOutputs())
                    .setOutputSchema(stageSpec.getOutputSchema())
                    .setErrorSchema(stageSpec.getErrorSchema())
                    .setErrorDatasetName(stageSpec.getErrorDatasetName())
                    .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
                    .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
                    .build());
        }
    }
    return phaseBuilder.build();
}
Also used : PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)

Example 8 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class ETLBatchApplication method configure.

/**
 * Configures the batch application: generates the pipeline spec from the (converted) config, checks that
 * it has exactly one batch source and plans to exactly one phase, adds the MapReduce or Spark program for
 * the configured engine, and finally adds and schedules the workflow.
 *
 * @throws IllegalArgumentException if the pipeline does not have exactly one source, does not plan to
 *     exactly one phase, or uses an engine other than MAPREDUCE or SPARK
 */
@Override
public void configure() {
    ETLBatchConfig config = getConfig().convertOldConfig();
    setDescription(DEFAULT_DESCRIPTION);

    PipelineSpecGenerator<ETLBatchConfig, BatchPipelineSpec> specGenerator = new BatchPipelineSpecGenerator(
        getConfigurer(),
        ImmutableSet.of(BatchSource.PLUGIN_TYPE),
        ImmutableSet.of(BatchSink.PLUGIN_TYPE),
        TimePartitionedFileSet.class,
        FileSetProperties.builder()
            .setInputFormat(AvroKeyInputFormat.class)
            .setOutputFormat(AvroKeyOutputFormat.class)
            .setEnableExploreOnCreate(true)
            .setSerDe("org.apache.hadoop.hive.serde2.avro.AvroSerDe")
            .setExploreInputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat")
            .setExploreOutputFormat("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat")
            .setTableProperty("avro.schema.literal", Constants.ERROR_SCHEMA.toString())
            .build(),
        config.getEngine());
    BatchPipelineSpec spec = specGenerator.generateSpec(config);

    // count the batch source stages; exactly one is required
    int numSources = 0;
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            numSources++;
        }
    }
    if (numSources != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }

    PipelinePlanner planner = new PipelinePlanner(
        SUPPORTED_PLUGIN_TYPES,
        ImmutableSet.<String>of(),
        ImmutableSet.<String>of(),
        ImmutableSet.<String>of());
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen if there is only one source
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase onlyPhase = plan.getPhases().values().iterator().next();

    // the phase spec is only built once the engine is known to be supported
    BatchPhaseSpec batchPhaseSpec;
    switch (config.getEngine()) {
        case MAPREDUCE:
            batchPhaseSpec = new BatchPhaseSpec(
                ETLMapReduce.NAME,
                onlyPhase,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties());
            addMapReduce(new ETLMapReduce(batchPhaseSpec));
            break;
        case SPARK:
            batchPhaseSpec = new BatchPhaseSpec(
                ETLSpark.class.getSimpleName(),
                onlyPhase,
                config.getResources(),
                config.getDriverResources(),
                config.getClientResources(),
                config.isStageLoggingEnabled(),
                config.isProcessTimingEnabled(),
                new HashMap<String, String>(),
                config.getNumOfRecordsPreview(),
                config.getProperties());
            addSpark(new ETLSpark(batchPhaseSpec));
            break;
        default:
            throw new IllegalArgumentException(
                String.format("Invalid execution engine '%s'. Must be one of %s.",
                              config.getEngine(), Joiner.on(',').join(Engine.values())));
    }

    addWorkflow(new ETLWorkflow(spec, config.getEngine()));
    scheduleWorkflow(
        Schedules.builder(SCHEDULE_NAME)
            .setDescription("ETL Batch schedule")
            .createTimeSchedule(config.getSchedule()),
        ETLWorkflow.NAME);
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) AvroKeyOutputFormat(org.apache.avro.mapreduce.AvroKeyOutputFormat) ETLBatchConfig(co.cask.cdap.etl.proto.v2.ETLBatchConfig) ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)

Example 9 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class SparkStreamingPipelineDriver method run.

/**
 * Runs the streaming pipeline: rebuilds the pipeline phase from the spec stored in the program
 * specification, resolves the checkpoint directory unless checkpoints are disabled, starts the streaming
 * context and waits for it to terminate.
 *
 * @param sec the spark execution context of this program
 * @throws Exception if the checkpoint location cannot be read or the streaming context fails
 */
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
    String pipelineJson = sec.getSpecification().getProperty(Constants.PIPELINEID);
    final DataStreamsPipelineSpec pipelineSpec = GSON.fromJson(pipelineJson, DataStreamsPipelineSpec.class);

    // the whole pipeline is a single phase made of every stage in the spec
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES)
        .addConnections(pipelineSpec.getConnections());
    for (StageSpec stage : pipelineSpec.getStages()) {
        phaseBuilder.addStage(
            StageInfo.builder(stage.getName(), stage.getPlugin().getType())
                .addInputs(stage.getInputs())
                .addOutputs(stage.getOutputs())
                .addInputSchemas(stage.getInputSchemas())
                .setOutputSchema(stage.getOutputSchema())
                .setErrorSchema(stage.getErrorSchema())
                .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
                .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
                .build());
    }
    final PipelinePhase pipelinePhase = phaseBuilder.build();

    // stays null when checkpoints are disabled
    String checkpointDir = null;
    if (!pipelineSpec.isCheckpointsDisabled()) {
        // the checkpoint directory is <fileset base>/<pipeline name>/<relative checkpoint dir>
        String pipelineName = sec.getApplicationSpecification().getName();
        String relativeCheckpointDir = pipelineSpec.getCheckpointDirectory();
        // there isn't any way to instantiate the fileset except in a TxRunnable, so the base location
        // is handed back through a reference
        final AtomicReference<Location> baseLocationRef = new AtomicReference<>();
        Transactionals.execute(sec, new TxRunnable() {

            @Override
            public void run(DatasetContext context) throws Exception {
                FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
                baseLocationRef.set(checkpointFileSet.getBaseLocation());
            }
        }, Exception.class);
        checkpointDir = baseLocationRef.get()
            .append(pipelineName)
            .append(relativeCheckpointDir)
            .toURI()
            .toString();
    }

    JavaStreamingContext jssc = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
    jssc.start();

    // most programs will just keep running forever. however, when CDAP stops the program, we get an
    // interrupted exception. at that point stop must be called on jssc, otherwise the program will hang
    // and never stop.
    boolean terminated = false;
    try {
        terminated = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
    } finally {
        if (!terminated) {
            jssc.stop(true, pipelineSpec.isStopGracefully());
        }
    }
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) AtomicReference(java.util.concurrent.atomic.AtomicReference) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) TxRunnable(co.cask.cdap.api.TxRunnable) StageSpec(co.cask.cdap.etl.spec.StageSpec) DatasetContext(co.cask.cdap.api.data.DatasetContext) Location(org.apache.twill.filesystem.Location)

Example 10 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class DataStreamsSparkLauncher method initialize.

/**
 * Prepares the spark program before it starts: logs the launch, builds the SparkConf from the pipeline
 * spec and the number of executors required by the streaming sources, and, unless checkpoints are
 * disabled, makes sure the pipeline's checkpoint directory exists while removing sibling directories
 * left under the same pipeline name.
 *
 * @throws Exception if a plugin cannot be instantiated or a checkpoint directory cannot be created
 */
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                       context.getApplicationSpecification().getName(),
                       UserGroupInformation.getCurrentUser().getShortUserName(),
                       arguments);

    String pipelineJson = context.getSpecification().getProperty(Constants.PIPELINEID);
    DataStreamsPipelineSpec spec = GSON.fromJson(pipelineJson, DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);

    // sum the executors required by every streaming source in the pipeline
    int numSources = 0;
    for (StageSpec stage : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stage.getPlugin().getType())) {
            StreamingSource<Object> source = pluginContext.newPluginInstance(stage.getName());
            numSources += source.getRequiredExecutors();
        }
    }

    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    // pipeline properties are applied after the default above and before the settings below
    for (Map.Entry<String, String> entry : spec.getProperties().entrySet()) {
        sparkConf.set(entry.getKey(), entry.getValue());
    }

    String extraJavaOpts = spec.getExtraJavaOpts();
    if (extraJavaOpts != null && !extraJavaOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraJavaOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraJavaOpts);
    }

    int sourcesPlusTwo = numSources + 2;
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(sourcesPlusTwo));
    sparkConf.set("spark.executor.instances", String.valueOf(sourcesPlusTwo));
    // spark... makes you set this to at least the number of receivers (streaming sources)
    // because it holds one thread per receiver, or one core in distributed mode.
    // so... we have to set this hacky master variable based on the isUnitTest setting in the config
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    } else {
        sparkConf.setMaster(String.format("local[%d]", sourcesPlusTwo));
    }
    context.setSparkConf(sparkConf);

    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory,
        // because another pipeline created with the same name must not pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, a unique pipeline id is generated
        // instead and used as a subdirectory inside the pipeline name directory.
        // On start, any other pipeline ids under that pipeline name are deleted if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);

        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(
                String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }

        // cleanup is best effort: failures are logged and never abort the launch
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (child.equals(pipelineCheckpointDir)) {
                    continue;
                }
                if (!child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }

        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(
                String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) Location(org.apache.twill.filesystem.Location)

Aggregations

StageSpec (co.cask.cdap.etl.spec.StageSpec): 10, HashMap (java.util.HashMap): 8, PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 7, PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner): 3, Connection (co.cask.cdap.etl.proto.Connection): 3, Map (java.util.Map): 3, FileSet (co.cask.cdap.api.dataset.lib.FileSet): 2, ETLMapReduce (co.cask.cdap.etl.batch.mapreduce.ETLMapReduce): 2, PipelinePlan (co.cask.cdap.etl.planner.PipelinePlan): 2, ETLSpark (co.cask.cdap.etl.spark.batch.ETLSpark): 2, PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 2, HashSet (java.util.HashSet): 2, LinkedHashMap (java.util.LinkedHashMap): 2, Location (org.apache.twill.filesystem.Location): 2, TxRunnable (co.cask.cdap.api.TxRunnable): 1, ArtifactId (co.cask.cdap.api.artifact.ArtifactId): 1, ArtifactVersion (co.cask.cdap.api.artifact.ArtifactVersion): 1, DatasetContext (co.cask.cdap.api.data.DatasetContext): 1, Schema (co.cask.cdap.api.data.schema.Schema): 1, SparkClientContext (co.cask.cdap.api.spark.SparkClientContext): 1