use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.
the class BatchSparkPipelineDriver method run.
/**
 * Runs one batch phase of the pipeline.
 *
 * <p>The phase spec is deserialized from the program specification property
 * {@code Constants.PIPELINEID}. Source/sink factories and stage partitions are read from the first
 * line of the localized {@code HydratorSpark.config} file. When the phase is not running with an
 * enabled data tracer and a SQL engine stage is configured, a {@link BatchSQLEngineAdapter} is
 * created and prepared before the pipeline runs.
 *
 * <p>Regardless of the outcome, the workflow token is updated and the SQL engine adapter (if one
 * was created) is notified of the result and closed.
 *
 * @param context dataset context, stored in {@code datasetContext} for later use
 * @throws Exception anything thrown while reading the configuration or running the pipeline
 */
@Override
public void run(DatasetContext context) throws Exception {
  BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID),
                                           BatchPhaseSpec.class);

  Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
  try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
    // The whole serialized object is expected on the first line of the file.
    String object = reader.readLine();
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
    sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
    sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
    stagePartitions = sourceSinkInfo.getStagePartitions();
  }
  datasetContext = context;

  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                  phaseSpec.isStageLoggingEnabled(),
                                                                  phaseSpec.isProcessTimingEnabled());

  // Per-stage statistics collectors are only created when the pipeline contains a condition.
  Map<String, StageStatisticsCollector> collectors = new HashMap<>();
  if (phaseSpec.pipelineContainsCondition()) {
    Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
    while (iterator.hasNext()) {
      StageSpec spec = iterator.next();
      collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
    }
  }

  boolean isSuccessful = true;
  try {
    PipelinePluginInstantiator pluginInstantiator =
      new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
    boolean shouldConsolidateStages = Boolean.parseBoolean(
      sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
    boolean shouldCacheFunctions = Boolean.parseBoolean(
      sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
    // An empty phase is treated as preview; otherwise the first stage's data tracer decides.
    boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0
      || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();

    // Initialize SQL engine instance if needed.
    if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
      String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());

      // Instantiate SQL engine and prepare run.
      try {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec),
                                                                  sec.getLogicalStartTime(),
                                                                  sec.getSecureStore(),
                                                                  sec.getServiceDiscoverer(),
                                                                  sec.getNamespace());
        Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
        sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
        sqlEngineAdapter.prepareRun();
      } catch (InstantiationException ie) {
        // Best effort: the pipeline still runs without the SQL engine.
        LOG.error("Could not create plugin instance for SQLEngine class", ie);
      } finally {
        if (sqlEngineAdapter == null) {
          LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
        }
      }
    }

    runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors,
                sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
  } catch (Throwable t) {
    // Mark this execution as not successful.
    isSuccessful = false;
    // Rethrow
    throw t;
  } finally {
    // Nested try/finally so that a failure while updating the workflow token or while notifying
    // the adapter cannot prevent the adapter from being closed.
    try {
      updateWorkflowToken(sec.getWorkflowToken(), collectors);
    } finally {
      // Close SQL Engine Adapter if needed.
      if (sqlEngineAdapter != null) {
        try {
          sqlEngineAdapter.onRunFinish(isSuccessful);
        } finally {
          sqlEngineAdapter.close();
        }
      }
    }
  }
}
use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.
the class PipelineAction method run.
/**
 * Instantiates the action plugin for the first stage of the phase and runs it.
 *
 * <p>The phase spec is deserialized from the custom action's {@code Constants.PIPELINEID}
 * property. The action is skipped when the data tracer for its stage is enabled. Afterwards every
 * argument added to the pipeline runtime arguments is copied into the workflow token.
 *
 * @throws IllegalStateException if no workflow token is available
 * @throws Exception anything thrown while instantiating or running the action
 */
@Override
public void run() throws Exception {
  CustomActionContext actionCtx = getContext();
  Map<String, String> specProperties = actionCtx.getSpecification().getProperties();
  String serializedPhase = specProperties.get(Constants.PIPELINEID);
  BatchPhaseSpec batchPhaseSpec = GSON.fromJson(serializedPhase, BatchPhaseSpec.class);

  // Only the first stage of the phase is used.
  StageSpec actionStage = batchPhaseSpec.getPhase().iterator().next();
  String stageName = actionStage.getName();

  PluginContext plugins = new PipelinePluginContext(actionCtx, metrics,
                                                    batchPhaseSpec.isStageLoggingEnabled(),
                                                    batchPhaseSpec.isProcessTimingEnabled());
  PipelineRuntime runtime = new PipelineRuntime(actionCtx, metrics);
  DefaultMacroEvaluator macros = new DefaultMacroEvaluator(runtime.getArguments(),
                                                           actionCtx.getLogicalStartTime(),
                                                           actionCtx, actionCtx,
                                                           actionCtx.getNamespace());
  Action plugin = plugins.newPluginInstance(stageName, macros);
  ActionContext pluginCtx = new BasicActionContext(actionCtx, runtime, actionStage);

  boolean tracerEnabled = actionCtx.getDataTracer(stageName).isEnabled();
  if (!tracerEnabled) {
    plugin.run(pluginCtx);
  }

  WorkflowToken workflowToken = actionCtx.getWorkflowToken();
  if (workflowToken == null) {
    throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
  }

  // Publish the arguments added during the run to the workflow token.
  Map<String, String> addedArguments = runtime.getArguments().getAddedArguments();
  for (Map.Entry<String, String> added : addedArguments.entrySet()) {
    workflowToken.put(added.getKey(), added.getValue());
  }
}
use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.
the class DataStreamsSparkLauncher method initialize.
/**
 * Builds the {@link SparkConf} for a streaming pipeline and sets it on the client context.
 *
 * <p>The pipeline spec is deserialized from the program specification property
 * {@code Constants.PIPELINEID}. The executors required by all streaming sources are summed, and
 * that total sizes the dispatcher threads, the local master and the executor instances.
 * Engine properties from the pipeline spec are applied last and override the defaults, except
 * that {@code spark.executor.instances} is raised to {@code numSources + 1} if set lower.
 *
 * @throws IllegalArgumentException if the user-provided {@code spark.executor.instances} is not an integer
 * @throws Exception anything thrown while instantiating the source plugins
 */
@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}",
                     context.getApplicationSpecification().getName(),
                     UserGroupInformation.getCurrentUser().getShortUserName(),
                     arguments);

  DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID),
                                               DataStreamsPipelineSpec.class);

  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
  // Sum the executors required by every streaming source in the pipeline.
  int numSources = 0;
  for (StageSpec stageSpec : spec.getStages()) {
    if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
      StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
      numSources += streamingSource.getRequiredExecutors();
    }
  }

  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.streaming.backpressure.enabled", "true");
  // The Spark property is 'spark.streaming.blockInterval'. The previous key carried a duplicated
  // 'spark.' prefix, so the setting was never picked up. The value is kept at 1 ms or more so
  // that a very small batch interval cannot produce a non-positive block interval.
  sparkConf.set("spark.streaming.blockInterval",
                String.valueOf(Math.max(1L, spec.getBatchIntervalMillis() / 5)));
  sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));

  String extraOpts = spec.getExtraJavaOpts();
  if (extraOpts != null && !extraOpts.isEmpty()) {
    sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
    sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
  }

  // without this, stopping will hang on machines with few cores.
  sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
  // spark... makes you set this to at least the number of receivers (streaming sources)
  // because it holds one thread per receiver, or one core in distributed mode.
  // so... we have to set this hacky master variable based on the isUnitTest setting in the config
  sparkConf.setMaster(String.format("local[%d]", numSources + 2));
  sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
  if (spec.isUnitTest()) {
    sparkConf.setMaster(String.format("local[%d]", numSources + 1));
  }

  // override defaults with any user provided engine configs
  int minExecutors = numSources + 1;
  for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
    if ("spark.executor.instances".equals(property.getKey())) {
      // don't let the user set this to something that doesn't make sense
      try {
        int numExecutors = Integer.parseInt(property.getValue());
        if (numExecutors < minExecutors) {
          LOG.warn("Number of executors {} is less than the minimum number required to run the pipeline. "
                     + "Automatically increasing it to {}", numExecutors, minExecutors);
          numExecutors = minExecutors;
        }
        sparkConf.set(property.getKey(), String.valueOf(numExecutors));
      } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
          "Number of spark executors was set to invalid value " + property.getValue(), e);
      }
    } else {
      sparkConf.set(property.getKey(), property.getValue());
    }
  }

  context.setSparkConf(sparkConf);
  WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.
the class SparkStreamingPipelineDriver method run.
/**
 * Creates the streaming context for the pipeline, or recovers it from a checkpoint.
 *
 * <p>Field lineage is recorded up front on a best-effort basis, since the context-creating
 * function may not run at all when a checkpoint exists.
 *
 * @param pipelineSpec streaming pipeline specification
 * @param pipelinePhase the phase to run
 * @param sec Spark execution context
 * @param checkpointDir checkpoint directory, or {@code null} to always build a fresh context
 * @param context Spark context to reuse, or {@code null} to create a new one when needed
 * @return the new or recovered streaming context
 * @throws Exception anything thrown while preparing or creating the streaming context
 */
private JavaStreamingContext run(DataStreamsPipelineSpec pipelineSpec, PipelinePhase pipelinePhase,
                                 JavaSparkExecutionContext sec, @Nullable String checkpointDir,
                                 @Nullable JavaSparkContext context) throws Exception {
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                  pipelineSpec.isStageLoggingEnabled(),
                                                                  pipelineSpec.isProcessTimingEnabled());
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(),
                                                       sec.getLogicalStartTime(),
                                                       sec.getSecureStore(),
                                                       sec.getServiceDiscoverer(),
                                                       sec.getNamespace());
  SparkStreamingPreparer preparer = new SparkStreamingPreparer(pluginContext, sec.getMetrics(), evaluator,
                                                               pipelineRuntime, sec);
  try {
    SparkFieldLineageRecorder recorder = new SparkFieldLineageRecorder(sec, pipelinePhase, pipelineSpec, preparer);
    recorder.record();
  } catch (Exception e) {
    // Lineage is best effort; a failure here must not stop the pipeline.
    LOG.warn("Failed to emit field lineage operations for streaming pipeline", e);
  }
  Set<String> uncombinableSinks = preparer.getUncombinableSinks();

  // the content in the function might not run due to spark checkpointing, currently just have the lineage logic
  // before anything is run
  Function0<JavaStreamingContext> contextFunction = () -> {
    JavaSparkContext javaSparkContext = context == null ? new JavaSparkContext() : context;
    JavaStreamingContext jssc = new JavaStreamingContext(
      javaSparkContext, Durations.milliseconds(pipelineSpec.getBatchIntervalMillis()));
    SparkStreamingPipelineRunner runner = new SparkStreamingPipelineRunner(sec, jssc, pipelineSpec,
                                                                           pipelineSpec.isCheckpointsDisabled());

    // Seems like they should be set at configure time instead of runtime? but that requires an API change.
    try {
      PhaseSpec phaseSpec = new PhaseSpec(sec.getApplicationSpecification().getName(), pipelinePhase,
                                          Collections.emptyMap(), pipelineSpec.isStageLoggingEnabled(),
                                          pipelineSpec.isProcessTimingEnabled());
      boolean shouldConsolidateStages = Boolean.parseBoolean(
        sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
      boolean shouldCacheFunctions = Boolean.parseBoolean(
        sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
      runner.runPipeline(phaseSpec, StreamingSource.PLUGIN_TYPE, sec, Collections.emptyMap(),
                         pluginContext, Collections.emptyMap(), uncombinableSinks,
                         shouldConsolidateStages, shouldCacheFunctions);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }

    if (checkpointDir != null) {
      jssc.checkpoint(checkpointDir);
      jssc.sparkContext().hadoopConfiguration().set("fs.defaultFS", checkpointDir);
    }
    return jssc;
  };

  if (checkpointDir == null) {
    return contextFunction.call();
  }
  // 'context' is @Nullable. Without a caller-supplied context there is no Hadoop configuration
  // to pass along, so use the overload that does not take one instead of dereferencing null.
  return context == null
    ? JavaStreamingContext.getOrCreate(checkpointDir, contextFunction)
    : JavaStreamingContext.getOrCreate(checkpointDir, contextFunction, context.hadoopConfiguration());
}
use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.
the class JavaSparkMainWrapper method run.
/**
 * Runs the external Spark program plugin configured for this stage.
 *
 * <p>The stage name comes from the program specification property
 * {@code ExternalSparkProgram.STAGE_NAME}. If the plugin class is a {@link JavaSparkMain}, it is
 * instantiated with macro evaluation and run directly. Otherwise its static
 * {@code main(String[])} method is invoked through the stage's {@link Caller}.
 *
 * @param sec Spark execution context
 * @throws Exception anything thrown while loading, instantiating or running the plugin
 */
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
  String serializedPhase = sec.getSpecification().getProperty(Constants.PIPELINEID);
  BatchPhaseSpec batchPhaseSpec = GSON.fromJson(serializedPhase, BatchPhaseSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(),
                                                                       sec.getMetrics(),
                                                                       batchPhaseSpec.isStageLoggingEnabled(),
                                                                       batchPhaseSpec.isProcessTimingEnabled());

  Class<?> pluginClass = pluginContext.loadPluginClass(stageName);

  // CDAP JavaSparkMain plugins are instantiated and run directly.
  if (JavaSparkMain.class.isAssignableFrom(pluginClass)) {
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec),
                                                              sec.getLogicalStartTime(),
                                                              sec.getSecureStore(),
                                                              sec.getServiceDiscoverer(),
                                                              sec.getNamespace());
    JavaSparkMain sparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
    sparkMain.run(sec);
    return;
  }

  // Anything else is assumed to expose a static 'main' method.
  String programArgs = getProgramArgs(sec, stageName);
  String[] mainArgs;
  if (programArgs == null) {
    mainArgs = RuntimeArguments.toPosixArray(sec.getRuntimeArguments());
  } else {
    mainArgs = programArgs.split(" ");
  }
  Method mainMethod = pluginClass.getMethod("main", String[].class);
  // Wrap the String[] so that reflection passes it as the single 'main' argument.
  Object[] invocationArgs = new Object[] {mainArgs};

  Caller caller = pluginContext.getCaller(stageName);
  Callable<Void> invokeMain = () -> {
    mainMethod.invoke(null, invocationArgs);
    return null;
  };
  caller.call(invokeMain);
}
Aggregations