Search in sources :

Example 6 with SparkPipelinePluginContext

use of co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext in project cdap by caskdata.

the class DataStreamsSparkLauncher method initialize.

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        sparkConf.set(property.getKey(), property.getValue());
    }
    // spark... makes you set this to at least the number of receivers (streaming sources)
    // because it holds one thread per receiver, or one core in distributed mode.
    // so... we have to set this hacky master variable based on the isUnitTest setting in the config
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    context.setSparkConf(sparkConf);
    if (!spec.isCheckpointsDisabled()) {
        // Each pipeline has its own checkpoint directory within the checkpoint fileset.
        // Ideally, when a pipeline is deleted, we would be able to delete that checkpoint directory.
        // This is because we don't want another pipeline created with the same name to pick up the old checkpoint.
        // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
        // and use that as the checkpoint directory as a subdirectory inside the pipeline name directory.
        // On start, we check for any other pipeline ids for that pipeline name, and delete them if they exist.
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String pipelineName = context.getApplicationSpecification().getName();
        String checkpointDir = spec.getCheckpointDirectory();
        Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
        Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
        if (!ensureDirExists(pipelineCheckpointBase)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
        }
        try {
            for (Location child : pipelineCheckpointBase.list()) {
                if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }
        if (!ensureDirExists(pipelineCheckpointDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
        }
    }
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) Location(org.apache.twill.filesystem.Location)

Aggregations

SparkPipelinePluginContext (co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext)6 MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)5 PluginContext (co.cask.cdap.api.plugin.PluginContext)4 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)4 TxRunnable (co.cask.cdap.api.TxRunnable)2 DatasetContext (co.cask.cdap.api.data.DatasetContext)2 SparkClientContext (co.cask.cdap.api.spark.SparkClientContext)2 SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext)2 BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)2 PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 SparkConf (org.apache.spark.SparkConf)2 FileSet (co.cask.cdap.api.dataset.lib.FileSet)1 DataTracer (co.cask.cdap.api.preview.DataTracer)1 JavaSparkMain (co.cask.cdap.api.spark.JavaSparkMain)1 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)1 BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable)1 BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner)1 BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext)1