Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.
The class ETLSpark, method initialize.
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.speculation", "false");
  // Turn off auto-broadcast by default until we better understand the implications
  // and can set this to a value that we are confident is safe.
  sparkConf.set("spark.sql.autoBroadcastJoinThreshold", "-1");
  sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
  sparkConf.set("spark.network.timeout", "600s");
  // Disable YARN app retries since Spark already performs retries at the task level.
  sparkConf.set("spark.yarn.maxAppAttempts", "1");
  // Make sure fields that are the same but differ in casing are treated as different
  // fields in auto-joins. See CDAP-17024.
  sparkConf.set("spark.sql.caseSensitive", "true");
  context.setSparkConf(sparkConf);
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(),
                                                       context, context, context.getNamespace());
  SparkPreparer preparer = new SparkPreparer(context, context.getMetrics(), evaluator, pipelineRuntime);
  List<Finisher> finishers = preparer.prepare(phaseSpec);
  finisher = new CompositeFinisher(finishers);
}
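The finisher assembled on the last line is what runs the pipeline's post-run actions once the run completes. A minimal sketch of the matching destroy() hook, assuming the Finisher interface exposes onFinish(boolean succeeded) and that the client context exposes the final ProgramState as other CDAP program contexts do (the exact CDAP signatures may differ):

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void destroy() {
  // Assumed sketch: report success or failure to every Finisher registered during prepare().
  boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
  finisher.onFinish(succeeded);
}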
Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.
The class DataStreamsSparkLauncher, method initialize.
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
  DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID),
                                               DataStreamsPipelineSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
  int numSources = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
      numSources = numSources + streamingSource.getRequiredExecutors();
    }
  }
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  sparkConf.set("spark.streaming.blockInterval", String.valueOf(spec.getBatchIntervalMillis() / 5));
  sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
  // Spark reserves one thread per receiver (streaming source) in local mode, or one core
  // per receiver in distributed mode, so the master/executor settings must allow for at
  // least that many. The hacky local master below is therefore sized from the number of
  // sources, with the isUnitTest setting in the config deciding the final thread count.
  String extraOpts = spec.getExtraJavaOpts();
  if (extraOpts != null && !extraOpts.isEmpty()) {
    sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
    sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
  }
  // Without this, stopping will hang on machines with few cores.
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
  sparkConf.setMaster(String.format("local[%d]", numSources + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
  if (spec.isUnitTest()) {
    sparkConf.setMaster(String.format("local[%d]", numSources + 1));
  }
  // Override defaults with any user-provided engine configs.
  int minExecutors = numSources + 1;
  for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
    if ("spark.executor.instances".equals(property.getKey())) {
      // Don't let the user set this to something that doesn't make sense.
      try {
        int numExecutors = Integer.parseInt(property.getValue());
        if (numExecutors < minExecutors) {
          LOG.warn("Number of executors {} is less than the minimum number required to run the pipeline. "
                     + "Automatically increasing it to {}", numExecutors, minExecutors);
          numExecutors = minExecutors;
        }
        sparkConf.set(property.getKey(), String.valueOf(numExecutors));
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException("Number of spark executors was set to invalid value "
                                             + property.getValue(), e);
      }
    } else {
      sparkConf.set(property.getKey(), property.getValue());
    }
  }
  context.setSparkConf(sparkConf);
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
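The executor handling above boils down to a clamp: never accept fewer executors than one per streaming source plus one for processing. A self-contained sketch of that rule as a helper (the method name is illustrative, not part of the CDAP source):

// Illustrative helper, not part of DataStreamsSparkLauncher: clamp a
// user-supplied "spark.executor.instances" value to the pipeline minimum.
private static int resolveExecutorInstances(String userValue, int numSources) {
  int minExecutors = numSources + 1;
  try {
    int requested = Integer.parseInt(userValue);
    return Math.max(requested, minExecutors);
  } catch (NumberFormatException e) {
    throw new IllegalArgumentException(
      "Number of spark executors was set to invalid value " + userValue, e);
  }
}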
Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.
The class CharCountProgram, method initialize.
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  context.setSparkConf(new SparkConf().set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec"));
  Table totals = context.getDataset("totals");
  // Read the existing value (the result is not used), then reset the counter to 0.
  totals.get(new Get("total").add("total")).getLong("total");
  totals.put(new Put("total").add("total", 0L));
}
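For comparison, the same Table API can also read a value back with a default and write an updated one. A hypothetical follow-up snippet (not part of CharCountProgram), assuming the same SparkClientContext is in scope:

// Hypothetical sketch, not taken from CharCountProgram: read the current total
// (defaulting to 0 if absent) and write back an incremented value.
Table totals = context.getDataset("totals");
long current = totals.get(new Get("total").add("total")).getLong("total", 0L);
totals.put(new Put("total").add("total", current + 1));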
Use of io.cdap.cdap.api.spark.SparkClientContext in project cdap by caskdata.
The class ExternalSparkProgram, method initialize.
@Override
protected void initialize() throws Exception {
  SparkClientContext context = getContext();
  BatchPhaseSpec phaseSpec = GSON.fromJson(getContext().getSpecification().getProperty(Constants.PIPELINEID),
                                           BatchPhaseSpec.class);
  SparkConf sparkConf = new SparkConf();
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  context.setSparkConf(sparkConf);
  String stageName = context.getSpecification().getProperty(STAGE_NAME);
  Class<?> externalProgramClass = context.loadPluginClass(stageName);
  // If the external program implements Spark, instantiate it and call initialize()
  // to provide full lifecycle support.
  if (Spark.class.isAssignableFrom(externalProgramClass)) {
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(context),
                                                              context.getLogicalStartTime(), context, context,
                                                              context.getNamespace());
    delegateSpark = context.newPluginInstance(stageName, macroEvaluator);
    if (delegateSpark instanceof AbstractSpark) {
      // noinspection unchecked
      ((AbstractSpark) delegateSpark).initialize(context);
    }
  }
}
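Since initialize() forwards the lifecycle call to the delegate, the teardown would mirror it. A sketch of what a corresponding destroy() could look like, assuming the delegate's destroy() is reachable through the same AbstractSpark cast used for initialize() (the actual CDAP teardown may differ):

@Override
public void destroy() {
  // Assumed sketch: mirror the initialize() delegation so the external
  // program gets its teardown callback as well.
  if (delegateSpark instanceof AbstractSpark) {
    ((AbstractSpark) delegateSpark).destroy();
  }
}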