Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.
The class PipelinePhasePreparer, method prepare.
/**
 * Prepares every stage of the given phase for the run and collects the finishers to invoke when the pipeline ends.
 *
 * <p>Stages are visited in the topological order of the phase DAG. For each stage a plugin is instantiated
 * according to its plugin type and wrapped in a {@link SubmitterPlugin}; {@code prepareRun()} is called on the
 * wrapper, which is then appended to the returned list. Stages for which no wrapper is produced ({@code null})
 * are skipped.
 *
 * @param phaseSpec the pipeline phase to prepare
 * @return the finishers, in the order the stages were prepared, that should be run when the pipeline ends
 */
public List<Finisher> prepare(PhaseSpec phaseSpec) throws TransactionFailureException, InstantiationException, IOException {
  PipelinePluginInstantiator instantiator = getPluginInstantiator(phaseSpec);
  PipelinePhase pipelinePhase = phaseSpec.getPhase();
  List<Finisher> result = new ArrayList<>();

  // Walk the stages in topological order: arguments set by one stage's prepareRun must be visible to the
  // stages that come after it.
  for (String name : pipelinePhase.getDag().getTopologicalOrder()) {
    StageSpec spec = pipelinePhase.getStage(name);
    String type = spec.getPluginType();

    // A connector stage acts as a source or as a sink depending on where it sits in the phase.
    boolean isConnector = Constants.Connector.PLUGIN_TYPE.equals(type);
    boolean connectorAsSource = isConnector && pipelinePhase.getSources().contains(name);
    boolean connectorAsSink = isConnector && pipelinePhase.getSinks().contains(name);

    SubmitterPlugin submitter;
    if (BatchSource.PLUGIN_TYPE.equals(type) || connectorAsSource) {
      BatchConfigurable<BatchSourceContext> source = instantiator.newPluginInstance(name, macroEvaluator);
      submitter = createSource(source, spec);
    } else if (BatchSink.PLUGIN_TYPE.equals(type) || AlertPublisher.PLUGIN_TYPE.equals(type) || connectorAsSink) {
      BatchConfigurable<BatchSinkContext> sink = instantiator.newPluginInstance(name, macroEvaluator);
      submitter = createSink(sink, spec);
    } else if (Transform.PLUGIN_TYPE.equals(type) || ErrorTransform.PLUGIN_TYPE.equals(type)) {
      Transform<?, ?> transformPlugin = instantiator.newPluginInstance(name, macroEvaluator);
      submitter = createTransform(transformPlugin, spec);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(type)) {
      // The aggregator plugin type covers two implementation classes; dispatch on the runtime class.
      Object instance = instantiator.newPluginInstance(name, macroEvaluator);
      if (instance instanceof BatchAggregator) {
        BatchAggregator<?, ?, ?> plainAggregator = (BatchAggregator) instance;
        submitter = createAggregator(plainAggregator, spec);
      } else if (instance instanceof BatchReducibleAggregator) {
        BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator) instance;
        submitter = createReducibleAggregator(reducibleAggregator, spec);
      } else {
        throw new IllegalStateException(
          String.format("Aggregator stage '%s' is of an unsupported class '%s'.",
                        spec.getName(), instance.getClass().getName()));
      }
    } else if (BatchJoiner.PLUGIN_TYPE.equals(type)) {
      // The joiner plugin type likewise covers two implementation classes.
      Object instance = instantiator.newPluginInstance(name, macroEvaluator);
      if (instance instanceof BatchJoiner) {
        BatchJoiner<?, ?, ?> explicitJoiner = (BatchJoiner<?, ?, ?>) instance;
        submitter = createJoiner(explicitJoiner, spec);
      } else if (instance instanceof BatchAutoJoiner) {
        BatchAutoJoiner autoJoiner = (BatchAutoJoiner) instance;
        // Auto joiners are validated before the submitter is created.
        validateAutoJoiner(autoJoiner, spec);
        submitter = createAutoJoiner(autoJoiner, spec);
      } else {
        throw new IllegalStateException(
          String.format("Join stage '%s' is of an unsupported class '%s'.",
                        spec.getName(), instance.getClass().getName()));
      }
    } else if (SplitterTransform.PLUGIN_TYPE.equals(type)) {
      SplitterTransform<?, ?> splitter = instantiator.newPluginInstance(name, macroEvaluator);
      submitter = createSplitterTransform(splitter, spec);
    } else {
      // Any other plugin type is handled by the generic create method.
      submitter = create(instantiator, spec);
    }

    // Nothing to prepare or finish for this stage.
    if (submitter == null) {
      continue;
    }
    submitter.prepareRun();
    result.add(submitter);
  }
  return result;
}
Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.
The class BatchSparkPipelineDriver, method run.
/**
 * Runs the pipeline phase described by the program specification.
 *
 * <p>The phase spec is read from the {@code Constants.PIPELINEID} property of the program specification, and the
 * source/sink factories and stage partitions are read from the first line of the localized
 * "HydratorSpark.config" file. When the pipeline contains a condition, a statistics collector is registered for
 * every stage in the phase. If preview is not enabled and a SQL engine stage is configured, a SQL engine adapter
 * is created and prepared before the pipeline is run; a failure to instantiate the SQL engine plugin is logged
 * and the pipeline runs without it.
 *
 * <p>After the run, whether it succeeded or not, the workflow token is updated and the SQL engine adapter (if
 * any) is notified of the outcome. The adapter is always closed, even if one of those two steps throws.
 *
 * @param context the dataset context, stored in {@code datasetContext}
 * @throws Exception any failure raised while reading the configuration, running the pipeline or cleaning up
 */
@Override
public void run(DatasetContext context) throws Exception {
  BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
  Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
  try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
    // The whole serialized object is expected on the first line of the file.
    String object = reader.readLine();
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
    sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
    sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
    stagePartitions = sourceSinkInfo.getStagePartitions();
  }
  datasetContext = context;
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
  // Per-stage statistics are only collected when the pipeline contains a condition.
  Map<String, StageStatisticsCollector> collectors = new HashMap<>();
  if (phaseSpec.pipelineContainsCondition()) {
    Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
    while (iterator.hasNext()) {
      StageSpec spec = iterator.next();
      collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
    }
  }
  boolean isSuccessful = true;
  try {
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
    // Both features default to enabled when the runtime argument is absent.
    boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
    boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
    // An empty phase is treated as preview; otherwise the data tracer of the first stage decides.
    boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
    // Initialize SQL engine instance if needed.
    if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
      String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
      // Instantiate SQL engine and prepare run.
      try {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
        Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
        sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
        sqlEngineAdapter.prepareRun();
      } catch (InstantiationException ie) {
        // Best effort: the pipeline still runs when the SQL engine plugin cannot be instantiated.
        LOG.error("Could not create plugin instance for SQLEngine class", ie);
      } finally {
        if (sqlEngineAdapter == null) {
          LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
        }
      }
    }
    runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
  } catch (Throwable t) {
    // Mark this execution as not successful.
    isSuccessful = false;
    // Rethrow
    throw t;
  } finally {
    try {
      updateWorkflowToken(sec.getWorkflowToken(), collectors);
      // Notify the SQL Engine Adapter of the outcome if needed.
      if (sqlEngineAdapter != null) {
        sqlEngineAdapter.onRunFinish(isSuccessful);
      }
    } finally {
      // Close the adapter even when the token update or the finish callback throws, so that it is not
      // left open.
      if (sqlEngineAdapter != null) {
        sqlEngineAdapter.close();
      }
    }
  }
}
Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.
The class MultiSinkFunction, method initializeBranchExecutors.
/**
 * Creates one executor per "group source" of the consolidated stage group and records, for every upstream
 * output that feeds the group, which group sources should receive its records.
 *
 * <p>Populates the {@code emitter}, {@code executorFactory}, {@code branchExecutors} and
 * {@code inputConnections} fields.
 *
 * @throws IllegalStateException if an executor cannot be created for one of the branches
 */
private void initializeBranchExecutors() {
  emitter = new DefaultEmitter<>();
  PipelinePluginInstantiator instantiator = new PipelinePluginInstantiator(
    pipelineRuntime.getPluginContext(), pipelineRuntime.getMetrics(), phaseSpec, new SingleConnectorFactory());
  MacroEvaluator evaluator = new DefaultMacroEvaluator(
    pipelineRuntime.getArguments(), pipelineRuntime.getLogicalStartTime(), pipelineRuntime.getSecureStore(),
    pipelineRuntime.getServiceDiscoverer(), pipelineRuntime.getNamespace());
  executorFactory = new SparkTransformExecutorFactory(
    instantiator, evaluator, null, collectors, dataTracers, pipelineRuntime, emitter);

  /*
     If the dag is:

             |--> t1 --> k1
       s1 --|
             |--> k2
                   ^
       s2 ---------|

     the group is t1, k1, and k2.
   */
  PipelinePhase phase = phaseSpec.getPhase();
  branchExecutors = new HashMap<>();
  inputConnections = new HashMap<>();

  for (String candidate : group) {
    Set<String> upstreamStages = phase.getStageInputs(candidate);

    // A stage whose inputs all come from inside the group is fed by another branch and is not a group
    // "source"; only stages with at least one input from outside the group start a branch.
    if (group.containsAll(upstreamStages)) {
      continue;
    }

    // The branch is the part of the pipeline reachable from the group source.
    // With the example above, the two branches are t1 -> k1, and k2.
    // pipelinePhase.subsetFrom() throws an exception if the new "source" is also a sink, since a Dag
    // cannot be a single node, so a single-stage branch is built by hand in that case.
    PipelinePhase branch = phase.getSinks().contains(candidate)
      ? PipelinePhase.builder(phase.getPluginTypes()).addStage(phase.getStage(candidate)).build()
      : phase.subsetFrom(Collections.singleton(candidate));

    try {
      branchExecutors.put(candidate, executorFactory.create(branch));
    } catch (Exception e) {
      String message = String.format(
        "Unable to get subset of pipeline starting from stage %s. "
          + "This indicates a planning error. Please report this bug and turn off stage "
          + "consolidation by setting %s to false in the runtime arguments.",
        candidate, Constants.CONSOLIDATE_STAGES);
      throw new IllegalStateException(message, e);
    }

    /*
       Map every possible input to the group sources it feeds, so that incoming records can be routed to
       the right branch executor.

       For example, the pipeline may look like:

                       |port a --> k1
         s --> split --|
                       |port b --> k2

       Here k1 and k2 are in the same group, so the map contains:
         { stageName: split, port: a, type: output } -> [k1]
         { stageName: split, port: b, type: output } -> [k2]

       A slightly more complicated example:

                            |--> k1
         s1 --> transform --|
                    |       |--> k2
                    |
                    |--> error collector --> k3

       Here k1, k2, k3, and error collector are in the same group, so the map contains:
         { stageName: transform, type: output } -> [k1, k2]
         { stageName: transform, type: error } -> [k3]
     */
    RecordType recordType;
    if (ErrorTransform.PLUGIN_TYPE.equals(phase.getStage(candidate).getPluginType())) {
      // Error transforms consume the error records of their inputs.
      recordType = RecordType.ERROR;
    } else {
      recordType = RecordType.OUTPUT;
    }

    for (String upstream : upstreamStages) {
      // Look up the output port of the upstream stage that is connected to this group source.
      StageSpec.Port connection = phase.getStage(upstream).getOutputPorts().get(candidate);
      InputInfo inputInfo = new InputInfo(upstream, recordType, connection.getPort());
      inputConnections.computeIfAbsent(inputInfo, unused -> new HashSet<>()).add(candidate);
    }
  }
}
Aggregations