Example 1 with SparkClientContext

Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.

The class ETLSpark, method initialize:

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.speculation", "false");
    // turn off auto-broadcast by default until we better understand the implications and can set this to a
    // value that we are confident is safe.
    sparkConf.set("spark.sql.autoBroadcastJoinThreshold", "-1");
    sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
    sparkConf.set("spark.network.timeout", "600s");
    // Disable yarn app retries since spark already performs retries at a task level.
    sparkConf.set("spark.yarn.maxAppAttempts", "1");
    // to make sure fields that are the same but different casing are treated as different fields in auto-joins
    // see CDAP-17024
    sparkConf.set("spark.sql.caseSensitive", "true");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    SparkPreparer preparer = new SparkPreparer(context, context.getMetrics(), evaluator, pipelineRuntime);
    List<Finisher> finishers = preparer.prepare(phaseSpec);
    finisher = new CompositeFinisher(finishers);
}
Also used : PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) Finisher(io.cdap.cdap.etl.common.submit.Finisher) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
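
All four examples on this page share the same lifecycle pattern: a Spark program overrides initialize(), obtains the SparkClientContext from getContext(), populates a SparkConf, and registers it with context.setSparkConf() before the job is submitted. A minimal sketch of that pattern (MinimalSparkProgram and its no-op main class are illustrative placeholders, not taken from the CDAP sources):

import io.cdap.cdap.api.spark.AbstractSpark;
import io.cdap.cdap.api.spark.JavaSparkExecutionContext;
import io.cdap.cdap.api.spark.JavaSparkMain;
import io.cdap.cdap.api.spark.SparkClientContext;
import org.apache.spark.SparkConf;

// Hypothetical minimal program; only the getContext()/setSparkConf() pattern
// is taken from the examples above.
public class MinimalSparkProgram extends AbstractSpark {

    @Override
    protected void configure() {
        setMainClass(MinimalSparkMain.class);
    }

    @Override
    protected void initialize() throws Exception {
        SparkClientContext context = getContext();
        SparkConf sparkConf = new SparkConf();
        // any Spark settings can be applied here, before the job is submitted
        sparkConf.set("spark.network.timeout", "600s");
        context.setSparkConf(sparkConf);
    }

    // no-op main class so the sketch is self-contained
    public static class MinimalSparkMain implements JavaSparkMain {
        @Override
        public void run(JavaSparkExecutionContext sec) {
        }
    }
}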

Example 2 with SparkClientContext

Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.

The class DataStreamsSparkLauncher, method initialize:

@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    sparkConf.set("spark.spark.streaming.blockInterval", String.valueOf(spec.getBatchIntervalMillis() / 5));
    sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    // Spark holds one thread per receiver (streaming source), or one full core in distributed
    // mode, so the master thread count and executor count must cover at least the number of
    // receivers; the smaller local master below is chosen when isUnitTest is set in the config.
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    // override defaults with any user provided engine configs
    int minExecutors = numSources + 1;
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        if ("spark.executor.instances".equals(property.getKey())) {
            // don't let the user set this to something that doesn't make sense
            try {
                int numExecutors = Integer.parseInt(property.getValue());
                if (numExecutors < minExecutors) {
                    LOG.warn("Number of executors {} is less than the minimum number required to run the pipeline. " + "Automatically increasing it to {}", numExecutors, minExecutors);
                    numExecutors = minExecutors;
                }
                sparkConf.set(property.getKey(), String.valueOf(numExecutors));
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("Number of spark executors was set to invalid value " + property.getValue(), e);
            }
        } else {
            sparkConf.set(property.getKey(), property.getValue());
        }
    }
    context.setSparkConf(sparkConf);
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used : SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
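
The engine-config loop above is mostly a pass-through; the one non-trivial branch clamps a user-supplied spark.executor.instances to the pipeline's minimum. A hypothetical helper (not part of the CDAP sources) that isolates the same validation, minus the warning log:

// Hypothetical helper mirroring the loop above: parse the user-supplied
// executor count and raise it to the minimum required by the pipeline.
private static int clampExecutors(String value, int minExecutors) {
    try {
        int numExecutors = Integer.parseInt(value);
        return Math.max(numExecutors, minExecutors);
    } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Number of spark executors was set to invalid value " + value, e);
    }
}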

Example 3 with SparkClientContext

Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.

The class CharCountProgram, method initialize:

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    context.setSparkConf(new SparkConf().set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec"));
    Table totals = context.getDataset("totals");
    // read the current total (the value is discarded), then reset it to 0
    totals.get(new Get("total").add("total")).getLong("total");
    totals.put(new Put("total").add("total", 0L));
}
Also used : Table(io.cdap.cdap.api.dataset.table.Table) Get(io.cdap.cdap.api.dataset.table.Get) SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) SparkConf(org.apache.spark.SparkConf) Put(io.cdap.cdap.api.dataset.table.Put)
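
The bare get() in this example discards its result before the counter is reset to 0. If the running total were meant to be carried forward instead, a read-modify-write would look like the sketch below, assuming the same context and totals dataset and the Row.getLong(column, defaultValue) accessor of the CDAP Table API:

Table totals = context.getDataset("totals");
// read the current value, defaulting to 0 if the row or column is absent
long current = totals.get(new Get("total").add("total")).getLong("total", 0L);
// write the incremented total back
totals.put(new Put("total").add("total", current + 1));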

Example 4 with SparkClientContext

Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.

The class ExternalSparkProgram, method initialize:

@Override
protected void initialize() throws Exception {
    SparkClientContext context = getContext();
    BatchPhaseSpec phaseSpec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    SparkConf sparkConf = new SparkConf();
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    context.setSparkConf(sparkConf);
    String stageName = context.getSpecification().getProperty(STAGE_NAME);
    Class<?> externalProgramClass = context.loadPluginClass(stageName);
    // If the external program implements Spark, instantiate it and call initialize() to provide full lifecycle support
    if (Spark.class.isAssignableFrom(externalProgramClass)) {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context, context.getNamespace());
        delegateSpark = context.newPluginInstance(stageName, macroEvaluator);
        if (delegateSpark instanceof AbstractSpark) {
            // noinspection unchecked
            ((AbstractSpark) delegateSpark).initialize(context);
        }
    }
}
Also used : MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) AbstractSpark(io.cdap.cdap.api.spark.AbstractSpark)
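
For the delegation branch above to fire, the loaded plugin class must implement Spark, and only AbstractSpark subclasses get the full initialize() lifecycle call. A hypothetical plugin that would take that path; the plugin type string "sparkprogram", the plugin name, and the reuse of the earlier sketch's main class are all assumptions for illustration:

import io.cdap.cdap.api.annotation.Name;
import io.cdap.cdap.api.annotation.Plugin;
import io.cdap.cdap.api.spark.AbstractSpark;

// hypothetical external Spark program plugin
@Plugin(type = "sparkprogram")
@Name("wordcount")
public class WordCountSpark extends AbstractSpark {

    @Override
    protected void configure() {
        // reuses the no-op main class from the earlier sketch
        setMainClass(MinimalSparkProgram.MinimalSparkMain.class);
    }

    @Override
    protected void initialize() throws Exception {
        // runs because ExternalSparkProgram calls initialize(context) on
        // AbstractSpark instances, as shown above
    }
}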

Aggregations

SparkClientContext (io.cdap.cdap.api.spark.SparkClientContext): 4 usages
SparkConf (org.apache.spark.SparkConf): 4 usages
HashMap (java.util.HashMap): 3 usages
Map (java.util.Map): 3 usages
TransactionPolicy (io.cdap.cdap.api.annotation.TransactionPolicy): 2 usages
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 2 usages
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 2 usages
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 2 usages
Get (io.cdap.cdap.api.dataset.table.Get): 1 usage
Put (io.cdap.cdap.api.dataset.table.Put): 1 usage
Table (io.cdap.cdap.api.dataset.table.Table): 1 usage
AbstractSpark (io.cdap.cdap.api.spark.AbstractSpark): 1 usage
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 1 usage
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 1 usage
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 1 usage
CompositeFinisher (io.cdap.cdap.etl.common.submit.CompositeFinisher): 1 usage
Finisher (io.cdap.cdap.etl.common.submit.Finisher): 1 usage
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 1 usage
SparkPipelinePluginContext (io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext): 1 usage