Search in sources :

Example 1 with SparkClientContext

use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.

the class CharCountProgram method initialize.

/**
 * Configures the Spark compression codec and resets the "total" cell of the "totals" table to zero.
 *
 * @throws Exception if the dataset cannot be obtained or accessed
 */
@Override
public void initialize() throws Exception {
    SparkClientContext sparkContext = getContext();

    // Build the configuration first, then hand it to the context.
    SparkConf conf = new SparkConf();
    conf.set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec");
    sparkContext.setSparkConf(conf);

    Table totalsTable = sparkContext.getDataset("totals");

    // The value read here is discarded; presumably the read only exercises dataset access — TODO confirm.
    Get readTotal = new Get("total").add("total");
    totalsTable.get(readTotal).getLong("total");

    // Reset the running total to zero.
    Put resetTotal = new Put("total").add("total", 0L);
    totalsTable.put(resetTotal);
}
Also used : Table(co.cask.cdap.api.dataset.table.Table) Get(co.cask.cdap.api.dataset.table.Get) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) SparkConf(org.apache.spark.SparkConf) Put(co.cask.cdap.api.dataset.table.Put)

Example 2 with SparkClientContext

use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.

the class ETLSpark method initialize.

/**
 * Prepares this Spark pipeline phase: configures Spark, instantiates the plugin of every stage in the phase,
 * calls {@code prepareRun()} on each one, and writes the collected source/sink/partition information to a
 * temporary config file that is localized as "HydratorSpark.config".
 *
 * <p>Declared with explicit transaction control via {@code @TransactionPolicy(TransactionControl.EXPLICIT)}.
 *
 * @throws Exception if any step fails, e.g. plugin instantiation, {@code prepareRun()}, or writing the config file
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final SparkClientContext context = getContext();
    // Temp files created below are recorded here; presumably they are deleted elsewhere once the run ends.
    cleanupFiles = new ArrayList<>();
    // Every prepared plugin is collected so it can be finished later through the composite finisher.
    List<Finisher> finishers = new ArrayList<>();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    // NOTE(review): sparkConf is handed to the context here but is still mutated by the pipeline-properties
    // loop below; that only takes effect if the context keeps a reference rather than a copy — confirm.
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    // The phase specification is stored as JSON under the PIPELINEID property of the program specification.
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    // Pipeline-level properties are applied last, so they override the defaults set above on key collision.
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
    // These factories and the partition map are filled in while the stages prepare, then serialized below.
    final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    final Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
    final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    final Admin admin = context.getAdmin();
    PipelinePhase phase = phaseSpec.getPhase();
    // go through in topological order so that arguments set by one stage are seen by stages after it
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        // A connector stage is treated as a source or a sink depending on where it sits in this phase.
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        // Stays null for plugin types not handled by any branch below; such stages are skipped.
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {

                @Override
                public BatchSourceContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            // NOTE(review): a transform is given a SparkBatchSourceContext (backed by sourceFactory) as its
            // StageSubmitterContext — confirm this is intended.
            ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {

                @Override
                public StageSubmitterContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {

                @Override
                public BatchSinkContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {

                @Override
                public SparkPluginContext getContext(DatasetContext datasetContext) {
                    return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
            // The extra prepare action records the joiner's partition count under its stage name.
            submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext sparkJoinerContext) {
                    stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    // Serialize the source/sink factories and the partition map to a temp file as UTF-8 JSON.
    File configFile = File.createTempFile("HydratorSpark", ".config");
    // Registered for cleanup before writing, so the file is tracked even if the write below fails.
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = new CompositeFinisher(finishers);
    // Localize the file under a fixed name, independent of the generated temp file name.
    context.localize("HydratorSpark.config", configFile.toURI());
    // When a workflow token is available, copy the arguments added during preparation into it.
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) StageSubmitterContext(co.cask.cdap.etl.api.StageSubmitterContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(co.cask.cdap.etl.common.BasicArguments) DatasetContext(co.cask.cdap.api.data.DatasetContext) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) Map(java.util.Map) HashMap(java.util.HashMap) File(java.io.File) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) 
PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) Admin(co.cask.cdap.api.Admin) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) Transform(co.cask.cdap.etl.api.Transform) SparkConf(org.apache.spark.SparkConf) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) Writer(java.io.Writer) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Example 3 with SparkClientContext

use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.

the class ExternalSparkProgram method initialize.

/**
 * Configures Spark JVM options, loads the external program class registered for this stage and, when that
 * class is a {@code Spark} program, instantiates it and forwards initialization to it.
 *
 * @throws Exception if the plugin class cannot be loaded or instantiated, or if the delegate fails to initialize
 */
@Override
protected void initialize() throws Exception {
    SparkClientContext clientContext = getContext();

    // Prepend the PermGen setting to whatever JVM options are already configured for driver and executor.
    SparkConf conf = new SparkConf();
    String[] javaOptionKeys = {"spark.driver.extraJavaOptions", "spark.executor.extraJavaOptions"};
    for (String optionKey : javaOptionKeys) {
        conf.set(optionKey, "-XX:MaxPermSize=256m " + conf.get(optionKey, ""));
    }
    clientContext.setSparkConf(conf);

    String pluginStage = clientContext.getSpecification().getProperty(STAGE_NAME);
    Class<?> pluginClass = clientContext.loadPluginClass(pluginStage);

    // Only programs that implement Spark get full lifecycle support; anything else needs no further setup here.
    if (!Spark.class.isAssignableFrom(pluginClass)) {
        return;
    }

    MacroEvaluator evaluator = new DefaultMacroEvaluator(
        new BasicArguments(clientContext),
        clientContext.getLogicalStartTime(),
        clientContext,
        clientContext.getNamespace());
    delegateSpark = clientContext.newPluginInstance(pluginStage, evaluator);

    if (delegateSpark instanceof AbstractSpark) {
        // noinspection unchecked
        ((AbstractSpark) delegateSpark).initialize(clientContext);
    }
}
Also used : MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(co.cask.cdap.etl.common.BasicArguments) SparkConf(org.apache.spark.SparkConf) AbstractSpark(co.cask.cdap.api.spark.AbstractSpark)

Example 4 with SparkClientContext

use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.

the class DataStreamsSparkLauncher method initialize.

/**
 * Logs the pipeline start, builds the Spark configuration from the pipeline spec (sizing threads and
 * executors from the executors required by the streaming sources), and, unless checkpoints are disabled,
 * prepares the pipeline's checkpoint directory while removing directories left by older pipelines.
 *
 * @throws Exception if the spec cannot be processed, or an {@code IOException} if a checkpoint directory
 *                   cannot be created
 */
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
    SparkClientContext clientContext = getContext();

    String runtimeArgs = Joiner.on(", ").withKeyValueSeparator("=").join(clientContext.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                       clientContext.getApplicationSpecification().getName(),
                       UserGroupInformation.getCurrentUser().getShortUserName(),
                       runtimeArgs);

    String serializedSpec = clientContext.getSpecification().getProperty(Constants.PIPELINEID);
    DataStreamsPipelineSpec pipelineSpec = GSON.fromJson(serializedSpec, DataStreamsPipelineSpec.class);
    PipelinePluginContext plugins =
        new SparkPipelinePluginContext(clientContext, clientContext.getMetrics(), true, true);

    // Sum the executors required by every streaming source in the pipeline.
    int requiredExecutors = 0;
    for (StageSpec stage : pipelineSpec.getStages()) {
        String stagePluginType = stage.getPlugin().getType();
        if (!StreamingSource.PLUGIN_TYPE.equals(stagePluginType)) {
            continue;
        }
        StreamingSource<Object> source = plugins.newPluginInstance(stage.getName());
        requiredExecutors += source.getRequiredExecutors();
    }

    SparkConf conf = new SparkConf();
    conf.set("spark.streaming.backpressure.enabled", "true");
    // Pipeline properties are applied after the default above, so they win on key collision.
    for (Map.Entry<String, String> entry : pipelineSpec.getProperties().entrySet()) {
        conf.set(entry.getKey(), entry.getValue());
    }

    // Extra JVM options from the spec, when present, apply to both driver and executors.
    String javaOpts = pipelineSpec.getExtraJavaOpts();
    boolean hasJavaOpts = javaOpts != null && !javaOpts.isEmpty();
    if (hasJavaOpts) {
        conf.set("spark.driver.extraJavaOptions", javaOpts);
        conf.set("spark.executor.extraJavaOptions", javaOpts);
    }

    // Spark needs at least one thread (or one core in distributed mode) per receiver, i.e. per streaming
    // source. Without the dispatcher thread setting, stopping will hang on machines with few cores.
    String sizedForSources = String.valueOf(requiredExecutors + 2);
    conf.set("spark.rpc.netty.dispatcher.numThreads", sizedForSources);
    conf.set("spark.executor.instances", sizedForSources);

    // The master is a workaround driven by the isUnitTest flag: unit tests use one thread fewer.
    conf.setMaster(String.format("local[%d]", requiredExecutors + 2));
    if (pipelineSpec.isUnitTest()) {
        conf.setMaster(String.format("local[%d]", requiredExecutors + 1));
    }
    clientContext.setSparkConf(conf);

    if (!pipelineSpec.isCheckpointsDisabled()) {
        // Every pipeline keeps its checkpoints in <fileset base>/<pipeline name>/<checkpoint directory>.
        // CDAP cannot run application logic when a pipeline is deleted, so a stale directory could be picked
        // up by a new pipeline that reuses the name. To avoid that, each pipeline gets a unique checkpoint
        // directory, and on start every sibling directory under the pipeline name is deleted.
        FileSet checkpoints = clientContext.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        String appName = clientContext.getApplicationSpecification().getName();
        String checkpointDirName = pipelineSpec.getCheckpointDirectory();
        Location baseDir = checkpoints.getBaseLocation().append(appName);
        Location currentDir = baseDir.append(checkpointDirName);

        if (!ensureDirExists(baseDir)) {
            throw new IOException(String.format("Unable to create checkpoint base directory '%s' for the pipeline.", baseDir));
        }

        // Best-effort cleanup: failures are logged and never abort initialization.
        try {
            for (Location candidate : baseDir.list()) {
                if (candidate.equals(currentDir)) {
                    continue;
                }
                if (!candidate.delete(true)) {
                    LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", candidate);
                }
            }
        } catch (Exception e) {
            LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
        }

        if (!ensureDirExists(currentDir)) {
            throw new IOException(String.format("Unable to create checkpoint directory '%s' for the pipeline.", currentDir));
        }
    }

    WRAPPERLOGGER.info("Pipeline '{}' running", clientContext.getApplicationSpecification().getName());
}
Also used : FileSet(co.cask.cdap.api.dataset.lib.FileSet) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) IOException(java.io.IOException) IOException(java.io.IOException) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) Location(org.apache.twill.filesystem.Location) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Aggregations

SparkClientContext (co.cask.cdap.api.spark.SparkClientContext)4 SparkConf (org.apache.spark.SparkConf)4 TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy)2 MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)2 BasicArguments (co.cask.cdap.etl.common.BasicArguments)2 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)2 SparkPipelinePluginContext (co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext)2 StageSpec (co.cask.cdap.etl.spec.StageSpec)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 Admin (co.cask.cdap.api.Admin)1 DatasetContext (co.cask.cdap.api.data.DatasetContext)1 FileSet (co.cask.cdap.api.dataset.lib.FileSet)1 Get (co.cask.cdap.api.dataset.table.Get)1 Put (co.cask.cdap.api.dataset.table.Put)1 Table (co.cask.cdap.api.dataset.table.Table)1 PluginContext (co.cask.cdap.api.plugin.PluginContext)1 AbstractSpark (co.cask.cdap.api.spark.AbstractSpark)1 WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken)1 StageSubmitterContext (co.cask.cdap.etl.api.StageSubmitterContext)1