Use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.
From the class CharCountProgram, method initialize:
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  // Tune the Spark configuration for this run before the job is submitted.
  context.setSparkConf(new SparkConf().set("spark.io.compression.codec",
                                           "org.apache.spark.io.LZFCompressionCodec"));
  Table totals = context.getDataset("totals");
  // Read the current value (the result is discarded), then reset the counter to 0.
  totals.get(new Get("total").add("total")).getLong("total");
  totals.put(new Put("total").add("total", 0L));
}
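In this example, initialize() runs on the client before the Spark job is submitted, which is why it is the place to set the SparkConf through SparkClientContext and to touch datasets transactionally. A minimal sketch of the same pattern, assuming the AbstractSpark lifecycle used above; the program name, main class, and configuration value are hypothetical:

import co.cask.cdap.api.spark.AbstractSpark;
import co.cask.cdap.api.spark.SparkClientContext;
import org.apache.spark.SparkConf;

public class ExampleSparkProgram extends AbstractSpark {

  @Override
  public void configure() {
    setName("ExampleSparkProgram");
    setMainClass(ExampleSparkMain.class); // hypothetical class holding the actual Spark job logic
  }

  @Override
  public void initialize() throws Exception {
    SparkClientContext context = getContext();
    // Any Spark property can be set here; it is applied to the job before submission.
    context.setSparkConf(new SparkConf().set("spark.executor.memory", "1g"));
  }
}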
Use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.
From the class ETLSpark, method initialize:
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  final SparkClientContext context = getContext();
  cleanupFiles = new ArrayList<>();
  List<Finisher> finishers = new ArrayList<>();
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.speculation", "false");
  context.setSparkConf(sparkConf);
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(),
                                                       context, context.getNamespace());
  final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
  final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
  final Map<String, Integer> stagePartitions = new HashMap<>();
  PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(),
                                                               phaseSpec.isStageLoggingEnabled(),
                                                               phaseSpec.isProcessTimingEnabled());
  PipelinePluginInstantiator pluginInstantiator =
    new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
  final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
  final Admin admin = context.getAdmin();
  PipelinePhase phase = phaseSpec.getPhase();
  // Go through in topological order so that arguments set by one stage are seen by stages after it.
  for (final String stageName : phase.getDag().getTopologicalOrder()) {
    final StageSpec stageSpec = phase.getStage(stageName);
    String pluginType = stageSpec.getPluginType();
    boolean isConnectorSource =
      Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
    boolean isConnectorSink =
      Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
    SubmitterPlugin submitterPlugin = null;
    if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
      BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {
        @Override
        public BatchSourceContext getContext(DatasetContext datasetContext) {
          return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
        }
      };
      submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
    } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
      Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {
        @Override
        public StageSubmitterContext getContext(DatasetContext datasetContext) {
          return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
        }
      };
      submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
    } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
      BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {
        @Override
        public BatchSinkContext getContext(DatasetContext datasetContext) {
          return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
        }
      };
      submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
    } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {
        @Override
        public SparkPluginContext getContext(DatasetContext datasetContext) {
          return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
        }
      };
      submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
      BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<DefaultAggregatorContext> contextProvider =
        new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
      submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
      BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
      ContextProvider<DefaultJoinerContext> contextProvider =
        new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
      submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider,
        new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
          @Override
          public void act(DefaultJoinerContext sparkJoinerContext) {
            stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
          }
        });
    }
    if (submitterPlugin != null) {
      submitterPlugin.prepareRun();
      finishers.add(submitterPlugin);
    }
  }
  File configFile = File.createTempFile("HydratorSpark", ".config");
  cleanupFiles.add(configFile);
  try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo =
      new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
    writer.write(GSON.toJson(sourceSinkInfo));
  }
  finisher = new CompositeFinisher(finishers);
  context.localize("HydratorSpark.config", configFile.toURI());
  WorkflowToken token = context.getWorkflowToken();
  if (token != null) {
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
      token.put(entry.getKey(), entry.getValue());
    }
  }
}
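The tail of the method isolates a pattern that is useful on its own: state computed at submission time (here the source/sink factories and per-stage partition counts) is serialized to a temporary JSON file and handed to the Spark run through SparkClientContext.localize, so the program can read it back by name at runtime. A minimal sketch of just that step, using only calls that appear above; the method, value, and file names are hypothetical:

import co.cask.cdap.api.spark.SparkClientContext;
import com.google.gson.Gson;
import java.io.File;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Map;

// Hypothetical helper: serializes per-stage partition counts (or any Gson-friendly value)
// and localizes the file so the Spark program can read it back under the given name.
void localizeRuntimeConfig(SparkClientContext context, Map<String, Integer> stagePartitions) throws Exception {
  File configFile = File.createTempFile("MyPipeline", ".config");
  try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
    writer.write(new Gson().toJson(stagePartitions));
  }
  // The file is made available to the Spark run under the name "MyPipeline.config".
  context.localize("MyPipeline.config", configFile.toURI());
}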
Use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.
From the class ExternalSparkProgram, method initialize:
@Override
protected void initialize() throws Exception {
  SparkClientContext context = getContext();
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.driver.extraJavaOptions",
                "-XX:MaxPermSize=256m " + sparkConf.get("spark.driver.extraJavaOptions", ""));
  sparkConf.set("spark.executor.extraJavaOptions",
                "-XX:MaxPermSize=256m " + sparkConf.get("spark.executor.extraJavaOptions", ""));
  context.setSparkConf(sparkConf);
  String stageName = context.getSpecification().getProperty(STAGE_NAME);
  Class<?> externalProgramClass = context.loadPluginClass(stageName);
  // If the external program implements Spark, instantiate it and call initialize() to provide full lifecycle support.
  if (Spark.class.isAssignableFrom(externalProgramClass)) {
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(context),
                                                              context.getLogicalStartTime(), context,
                                                              context.getNamespace());
    delegateSpark = context.newPluginInstance(stageName, macroEvaluator);
    if (delegateSpark instanceof AbstractSpark) {
      // noinspection unchecked
      ((AbstractSpark) delegateSpark).initialize(context);
    }
  }
}
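The two sparkConf.set calls above append -XX:MaxPermSize=256m to any extraJavaOptions already present by reading the current value with an empty-string default rather than overwriting it. A small sketch of that append-don't-overwrite pattern as a reusable helper; the method name is hypothetical and only standard SparkConf calls are used:

import org.apache.spark.SparkConf;

// Prepends a JVM option to both driver and executor extraJavaOptions, keeping anything already set.
static SparkConf addJavaOption(SparkConf conf, String option) {
  for (String key : new String[] { "spark.driver.extraJavaOptions", "spark.executor.extraJavaOptions" }) {
    conf.set(key, (option + " " + conf.get(key, "")).trim());
  }
  return conf;
}

With such a helper, the configuration above would reduce to context.setSparkConf(addJavaOption(new SparkConf(), "-XX:MaxPermSize=256m")).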
Use of co.cask.cdap.api.spark.SparkClientContext in project cdap by caskdata.
From the class DataStreamsSparkLauncher, method initialize:
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
  DataStreamsPipelineSpec spec =
    GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
  int numSources = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
      numSources = numSources + streamingSource.getRequiredExecutors();
    }
  }
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
    sparkConf.set(property.getKey(), property.getValue());
  }
  // Spark dedicates one thread (in local mode) or one core (in distributed mode) to each receiver,
  // i.e. to each streaming source, so the parallelism settings below must account for the number of
  // sources; the local master is chosen based on the isUnitTest flag in the pipeline spec.
  String extraOpts = spec.getExtraJavaOpts();
  if (extraOpts != null && !extraOpts.isEmpty()) {
    sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
    sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
  }
  // Without this, stopping will hang on machines with few cores.
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
  sparkConf.setMaster(String.format("local[%d]", numSources + 2));
  if (spec.isUnitTest()) {
    sparkConf.setMaster(String.format("local[%d]", numSources + 1));
  }
  context.setSparkConf(sparkConf);
  if (!spec.isCheckpointsDisabled()) {
    // Each pipeline has its own checkpoint directory within the checkpoint fileset.
    // Ideally, when a pipeline is deleted, we would delete that checkpoint directory as well,
    // because we don't want another pipeline created with the same name to pick up the old checkpoint.
    // Since CDAP has no way to run application logic on deletion, we instead generate a unique pipeline id
    // and use it as a checkpoint subdirectory inside the pipeline name directory.
    // On start, we check for any other pipeline ids under that pipeline name and delete them if they exist.
    FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
    String pipelineName = context.getApplicationSpecification().getName();
    String checkpointDir = spec.getCheckpointDirectory();
    Location pipelineCheckpointBase = checkpointFileSet.getBaseLocation().append(pipelineName);
    Location pipelineCheckpointDir = pipelineCheckpointBase.append(checkpointDir);
    if (!ensureDirExists(pipelineCheckpointBase)) {
      throw new IOException(
        String.format("Unable to create checkpoint base directory '%s' for the pipeline.", pipelineCheckpointBase));
    }
    try {
      for (Location child : pipelineCheckpointBase.list()) {
        if (!child.equals(pipelineCheckpointDir) && !child.delete(true)) {
          LOG.warn("Unable to delete checkpoint directory {} from an old pipeline.", child);
        }
      }
    } catch (Exception e) {
      LOG.warn("Unable to clean up old checkpoint directories from old pipelines.", e);
    }
    if (!ensureDirExists(pipelineCheckpointDir)) {
      throw new IOException(
        String.format("Unable to create checkpoint directory '%s' for the pipeline.", pipelineCheckpointDir));
    }
  }
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
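The checkpoint handling above calls an ensureDirExists helper that is not part of the excerpt. A plausible minimal version, assuming the Apache Twill Location API that the rest of the snippet already uses:

import java.io.IOException;
import org.apache.twill.filesystem.Location;

// Returns true if the location already is, or could be created as, a directory.
// The trailing isDirectory() check covers the race where another process creates it first.
private static boolean ensureDirExists(Location location) throws IOException {
  return location.isDirectory() || location.mkdirs() || location.isDirectory();
}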