Use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.
The class JavaSparkMainWrapper, method run.
/**
 * Launches the program that backs the configured external Spark stage.
 *
 * <p>The stage name and the serialized {@link BatchPhaseSpec} are read from the program
 * specification and the stage's plugin class is loaded. When that class is a
 * {@link JavaSparkMain}, a plugin instance is created and its {@code run} method is called with
 * the same execution context. Otherwise the class's {@code main(String[])} method is looked up
 * reflectively and invoked, with a {@code null} receiver, through the stage's {@link Caller}.
 *
 * <p>For the {@code main} path the arguments are the value returned by {@code getProgramArgs}
 * split on single spaces, or, when that value is {@code null}, the runtime arguments converted
 * with {@code RuntimeArguments.toPosixArray}.
 *
 * @param sec the execution context of the running Spark program
 * @throws Exception if loading, instantiating, or running the stage's program fails
 */
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
  String phaseSpecJson = sec.getSpecification().getProperty(Constants.PIPELINEID);
  BatchPhaseSpec batchPhaseSpec = GSON.fromJson(phaseSpecJson, BatchPhaseSpec.class);

  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(
    sec.getPluginContext(),
    sec.getMetrics(),
    batchPhaseSpec.isStageLoggingEnabled(),
    batchPhaseSpec.isProcessTimingEnabled());
  Class<?> mainClass = pluginContext.loadPluginClass(stageName);

  // a CDAP JavaSparkMain is instantiated as a plugin and run directly
  if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
      new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getNamespace());
    JavaSparkMain sparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
    sparkMain.run(sec);
    return;
  }

  // anything else is assumed to expose a 'main' method
  String programArgs = getProgramArgs(sec, stageName);
  String[] args;
  if (programArgs == null) {
    args = RuntimeArguments.toPosixArray(sec.getRuntimeArguments());
  } else {
    args = programArgs.split(" ");
  }

  final Method mainMethod = mainClass.getMethod("main", String[].class);
  // wrap the String[] so reflection passes it as the single parameter rather than as varargs
  final Object[] invokeArgs = {args};

  Caller stageCaller = pluginContext.getCaller(stageName);
  stageCaller.call(new Callable<Void>() {
    @Override
    public Void call() throws Exception {
      mainMethod.invoke(null, invokeArgs);
      return null;
    }
  });
}
Use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.
The class ETLMapReduce, method initialize.
/**
 * Configures the Hadoop job for one pipeline phase before it is submitted.
 *
 * <p>In order, this method: optionally starts stage logging, disables speculative execution,
 * copies the phase's pipeline properties into the job configuration, selects the mapper and
 * (when the phase contains an aggregator or joiner) the reducer class, calls {@code prepareRun}
 * on every stage that yields a {@link SubmitterPlugin} in topological order, and finally
 * serializes the collected sink outputs, input aliases and runtime arguments into the job
 * configuration. Arguments added by stages are also copied into the workflow token when one exists.
 *
 * @throws IllegalStateException if the phase contains more than one aggregator/joiner stage
 * @throws Exception if instantiating or preparing any stage fails
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
final MapReduceContext context = getContext();
Map<String, String> properties = context.getSpecification().getProperties();
// stage logging is only started when the specification property is set to "true"
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
// every prepared stage is collected here and later wrapped in a CompositeFinisher
List<Finisher> finishers = new ArrayList<>();
final Job job = context.getHadoopJob();
final Configuration hConf = job.getConfiguration();
// speculative execution is switched off for both map and reduce tasks
// NOTE(review): presumably to avoid duplicate task attempts writing to sinks - confirm
hConf.setBoolean("mapreduce.map.speculative", false);
hConf.setBoolean("mapreduce.reduce.speculative", false);
// macro evaluator shared by every plugin instantiation below, backed by the pipeline runtime arguments
// NOTE(review): the previous comment here ("plugin name -> runtime args for that plugin") did not
// describe any statement in this method and looked stale
MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
// the phase spec and the connector dataset names are stored as JSON in the program specification
BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
// pipeline-level properties are applied directly to the Hadoop configuration
for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
}
final PipelinePhase phase = phaseSpec.getPhase();
PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
// aggregators and joiners are the stages that need a reduce step; more than one of them in a phase
// should never happen if planner is correct
Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
if (reducers.size() > 1) {
// build a comma separated list of the offending stage names for the error message
Iterator<StageSpec> reducerIter = reducers.iterator();
StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
while (reducerIter.hasNext()) {
reducersStr.append(",");
reducersStr.append(reducerIter.next().getName());
}
throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
}
job.setMapperClass(ETLMapper.class);
// without an aggregator or joiner the job is map-only
if (reducers.isEmpty()) {
job.setNumReduceTasks(0);
} else {
job.setReducerClass(ETLReducer.class);
}
// filled in by the PrepareAction callbacks below and written to the configuration after the loop
final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
final Map<String, String> inputAliasToStage = new HashMap<>();
// call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
for (final String stageName : phase.getDag().getTopologicalOrder()) {
final StageSpec stageSpec = phase.getStage(stageName);
String pluginType = stageSpec.getPluginType();
// a connector stage acts as a source or a sink depending on where it sits in this phase
boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
// stays null for plugin types that are not handled by any branch below
SubmitterPlugin submitterPlugin = null;
if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSource, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sourceContext) {
// remember which stage each input alias registered by the source belongs to
for (String inputAlias : sourceContext.getInputNames()) {
inputAliasToStage.put(inputAlias, stageName);
}
}
});
} else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSink, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sinkContext) {
// record the output names registered by this sink under its stage name
sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
}
});
} else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
// transforms are prepared without any extra PrepareAction
Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, transform, contextProvider);
} else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(stageName, context, aggregator, contextProvider, new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {
@Override
public void act(DefaultAggregatorContext aggregatorContext) {
// the reducer count is only overridden when the aggregator context carries a partition count
if (aggregatorContext.getNumPartitions() != null) {
job.setNumReduceTasks(aggregatorContext.getNumPartitions());
}
// key/value classes come from the context when set there, otherwise from TypeChecker
Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
Class<?> outputValClass = aggregatorContext.getGroupValueClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
}
if (outputValClass == null) {
outputValClass = TypeChecker.getGroupValueClass(aggregator);
}
// the class names are stored in the configuration; the job itself is given the classes
// returned by getOutputKeyClass/getOutputValClass
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, outputValClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
}
});
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchJoiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
@Override
public void act(DefaultJoinerContext joinerContext) {
// the reducer count is only overridden when the joiner context carries a partition count
if (joinerContext.getNumPartitions() != null) {
job.setNumReduceTasks(joinerContext.getNumPartitions());
}
// key/record classes come from the context when set there, otherwise from TypeChecker
Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
}
if (inputRecordClass == null) {
inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
}
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
// the return value is discarded here
// NOTE(review): presumably called only so an unsupported record class is rejected - confirm
getOutputValClass(stageName, inputRecordClass);
// for joiner plugin map output is tagged with stageName
job.setMapOutputValueClass(TaggedWritable.class);
}
});
}
// prepare the stage now so that arguments it adds are available to the stages that follow
if (submitterPlugin != null) {
submitterPlugin.prepareRun();
finishers.add(submitterPlugin);
}
}
// publish what the PrepareAction callbacks collected through the job configuration
hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
finisher = new CompositeFinisher(finishers);
// NOTE(review): the mapper class was already set before the stage loop; this second call looks
// redundant unless a stage's prepareRun can change the job's mapper - confirm before removing
job.setMapperClass(ETLMapper.class);
// arguments added by stages during prepareRun are copied into the workflow token when there is one
WorkflowToken token = context.getWorkflowToken();
if (token != null) {
for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
token.put(entry.getKey(), entry.getValue());
}
}
// token is null when just the mapreduce job is run but not the entire workflow
// we still want things to work in that case.
hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
Use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.
The class PipelineCondition, method apply.
/**
 * Evaluates the condition plugin of this pipeline phase.
 *
 * <p>The serialized {@link BatchPhaseSpec} is read from the condition specification and the first
 * stage of its phase is taken as the condition stage. The plugin for that stage is instantiated
 * and applied; any arguments added to the pipeline runtime are then copied into the workflow token.
 *
 * @param input the workflow context the condition runs in
 * @return the value returned by the condition plugin
 * @throws IllegalStateException if {@code input} is {@code null}
 * @throws RuntimeException wrapping any exception raised while instantiating or applying the
 *   condition, including the one raised when the workflow token is {@code null}
 */
@Override
public boolean apply(@Nullable WorkflowContext input) {
  // should not happen
  if (input == null) {
    throw new IllegalStateException("WorkflowContext for the Condition cannot be null.");
  }

  Map<String, String> specProperties = input.getConditionSpecification().getProperties();
  String phaseSpecJson = specProperties.get(Constants.PIPELINEID);
  BatchPhaseSpec phaseSpec = GSON.fromJson(phaseSpecJson, BatchPhaseSpec.class);
  // the first stage returned by the phase iterator is used as the condition stage
  StageSpec stageSpec = phaseSpec.getPhase().iterator().next();

  PluginContext pluginContext = new PipelinePluginContext(
    input, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
  BasicArguments macroArguments = new BasicArguments(input.getToken(), input.getRuntimeArguments());
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
    macroArguments, input.getLogicalStartTime(), input, input.getNamespace());

  try {
    Condition condition = pluginContext.newPluginInstance(stageSpec.getName(), macroEvaluator);
    PipelineRuntime pipelineRuntime = new PipelineRuntime(input, metrics);
    ConditionContext conditionContext = new BasicConditionContext(input, pipelineRuntime, stageSpec);
    boolean outcome = condition.apply(conditionContext);

    WorkflowToken workflowToken = input.getToken();
    if (workflowToken == null) {
      throw new IllegalStateException("WorkflowToken cannot be null when Condition is executed through Workflow.");
    }

    // copy every argument added during the condition into the workflow token
    Map<String, String> addedArguments = pipelineRuntime.getArguments().getAddedArguments();
    for (Map.Entry<String, String> added : addedArguments.entrySet()) {
      workflowToken.put(added.getKey(), added.getValue());
    }
    return outcome;
  } catch (Exception e) {
    throw new RuntimeException(
      String.format("Error executing condition '%s' in the pipeline.", stageSpec.getName()), e);
  }
}
Use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.
The class PipelineAction, method run.
/**
 * Runs the action plugin of this pipeline phase.
 *
 * <p>The serialized {@link BatchPhaseSpec} is read from the custom action specification and the
 * first stage of its phase is taken as the action stage. The plugin is instantiated and, unless
 * the data tracer for the stage is enabled, run. Arguments added to the pipeline runtime are then
 * copied into the workflow token.
 *
 * @throws IllegalStateException if the workflow token is {@code null}; this is checked after the
 *   action has been given the chance to run
 * @throws Exception if instantiating or running the action fails
 */
@Override
public void run() throws Exception {
  CustomActionContext context = getContext();
  Map<String, String> specProperties = context.getSpecification().getProperties();
  String phaseSpecJson = specProperties.get(Constants.PIPELINEID);
  BatchPhaseSpec phaseSpec = GSON.fromJson(phaseSpecJson, BatchPhaseSpec.class);
  // the first stage returned by the phase iterator is used as the action stage
  StageSpec stageSpec = phaseSpec.getPhase().iterator().next();

  PluginContext pluginContext = new PipelinePluginContext(
    context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);

  String stageName = stageSpec.getName();
  DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
    pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
  Action action = pluginContext.newPluginInstance(stageName, macroEvaluator);
  ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);

  // the action is skipped when the data tracer for this stage is enabled
  boolean tracerEnabled = context.getDataTracer(stageSpec.getName()).isEnabled();
  if (!tracerEnabled) {
    action.run(actionContext);
  }

  WorkflowToken workflowToken = context.getWorkflowToken();
  if (workflowToken == null) {
    throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
  }

  // copy every argument added during the action into the workflow token
  Map<String, String> addedArguments = pipelineRuntime.getArguments().getAddedArguments();
  for (Map.Entry<String, String> added : addedArguments.entrySet()) {
    workflowToken.put(added.getKey(), added.getValue());
  }
}
Use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.
The class ETLSpark, method initialize.
/**
 * Prepares the Spark program for one pipeline phase before it is submitted.
 *
 * <p>In order, this method: builds a {@link SparkConf} (JVM options, speculation off, pipeline
 * properties), calls {@code prepareRun} on every stage that yields a {@link SubmitterPlugin} in
 * topological order, writes the source factory, sink factory and stage partition information as
 * JSON to a temporary file that is localized under the name {@code HydratorSpark.config}, and
 * copies arguments added by stages into the workflow token when one exists.
 *
 * @throws Exception if instantiating or preparing a stage, or writing the config file, fails
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
final SparkClientContext context = getContext();
// files created here are tracked in this field; nothing in this method deletes them
cleanupFiles = new ArrayList<>();
// every prepared stage is collected here and later wrapped in a CompositeFinisher
List<Finisher> finishers = new ArrayList<>();
SparkConf sparkConf = new SparkConf();
// same perm gen limit for driver and executors, and no speculative task attempts
sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
sparkConf.set("spark.speculation", "false");
// NOTE(review): the conf is handed to the context before the pipeline properties are added below;
// this only works if setSparkConf keeps a reference to the object rather than copying it - confirm
context.setSparkConf(sparkConf);
Map<String, String> properties = context.getSpecification().getProperties();
// the phase spec is stored as JSON in the program specification
BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
// pipeline-level properties are applied on top of the defaults set above
for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
}
// macro evaluator shared by every plugin instantiation below
MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
// passed to the source/sink contexts created below and serialized into the config file at the end
final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
// joiner stage name -> partition count reported by its context
final Map<String, Integer> stagePartitions = new HashMap<>();
PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
final Admin admin = context.getAdmin();
PipelinePhase phase = phaseSpec.getPhase();
// go through in topological order so that arguments set by one stage are seen by stages after it
for (final String stageName : phase.getDag().getTopologicalOrder()) {
final StageSpec stageSpec = phase.getStage(stageName);
String pluginType = stageSpec.getPluginType();
// a connector stage acts as a source or a sink depending on where it sits in this phase
boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
// stays null for plugin types that are not handled by any branch below
SubmitterPlugin submitterPlugin = null;
if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {
@Override
public BatchSourceContext getContext(DatasetContext datasetContext) {
return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
} else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {
@Override
public StageSubmitterContext getContext(DatasetContext datasetContext) {
// NOTE(review): a transform stage is given a SparkBatchSourceContext here - confirm this is intended
return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
} else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {
@Override
public BatchSinkContext getContext(DatasetContext datasetContext) {
return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
} else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {
@Override
public SparkPluginContext getContext(DatasetContext datasetContext) {
return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
}
};
submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
} else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
// aggregators are prepared without a PrepareAction, so no partition count is recorded for them
BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
@Override
public void act(DefaultJoinerContext sparkJoinerContext) {
// the partition count is stored exactly as returned
// NOTE(review): confirm whether getNumPartitions can return null here
stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
}
});
}
// prepare the stage now so that arguments it adds are available to the stages that follow
if (submitterPlugin != null) {
submitterPlugin.prepareRun();
finishers.add(submitterPlugin);
}
}
// the temp file is registered in cleanupFiles before it is written
File configFile = File.createTempFile("HydratorSpark", ".config");
cleanupFiles.add(configFile);
try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
writer.write(GSON.toJson(sourceSinkInfo));
}
finisher = new CompositeFinisher(finishers);
// localize the file under a fixed name
context.localize("HydratorSpark.config", configFile.toURI());
// arguments added by stages during prepareRun are copied into the workflow token when there is one
WorkflowToken token = context.getWorkflowToken();
if (token != null) {
for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
token.put(entry.getKey(), entry.getValue());
}
}
}
Aggregations