Use of io.cdap.cdap.etl.api.join.AutoJoiner in project cdap by caskdata.
The class PipelineSpecGenerator, method configureStage.
/**
 * Configures a plugin and returns the spec for it.
 *
 * @param stageName the unique plugin id
 * @param etlPlugin user provided configuration for the plugin
 * @param pipelineConfigurer default pipeline configurer to configure the plugin
 * @return the spec for the plugin
 * @throws IllegalArgumentException if a plugin with the same id is already deployed
 * @throws ValidationException if the plugin threw an exception during configuration
 */
public StageSpec.Builder configureStage(String stageName, ETLPlugin etlPlugin,
                                        DefaultPipelineConfigurer pipelineConfigurer) throws ValidationException {
  TrackedPluginSelector pluginSelector =
    new TrackedPluginSelector(new ArtifactSelectorProvider().getPluginSelector(etlPlugin.getArtifactConfig()));
  String type = etlPlugin.getType();
  String pluginName = etlPlugin.getName();
  DefaultStageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
  FailureCollector collector = stageConfigurer.getFailureCollector();
  Object plugin = getPlugin(stageName, etlPlugin, pluginSelector, type, pluginName, collector);
  try {
    if (type.equals(BatchJoiner.PLUGIN_TYPE)) {
      MultiInputPipelineConfigurable multiPlugin = (MultiInputPipelineConfigurable) plugin;
      multiPlugin.configurePipeline(pipelineConfigurer);
      // AutoJoiner plugins stay under the BatchJoiner plugin type and delegate
      // to the BatchAutoJoiner while preserving backwards compatibility in the pipeline config.
      if (plugin instanceof AutoJoiner) {
        configureAutoJoiner(stageName, (AutoJoiner) plugin, stageConfigurer, collector);
      }
    } else if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
      MultiOutputPipelineConfigurable multiOutputPlugin = (MultiOutputPipelineConfigurable) plugin;
      multiOutputPlugin.configurePipeline(pipelineConfigurer);
    } else if (!type.equals(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
      PipelineConfigurable singlePlugin = (PipelineConfigurable) plugin;
      singlePlugin.configurePipeline(pipelineConfigurer);
      // evaluate macros and find out whether a connection is used
      if ((sourcePluginTypes.contains(type) || BatchSink.PLUGIN_TYPE.equals(type)) && runtimeEvaluator == null) {
        pluginConfigurer.evaluateMacros(etlPlugin.getProperties(), connectionEvaluator, options);
      }
    }
  } catch (InvalidConfigPropertyException e) {
    collector.addFailure(e.getMessage(),
                         String.format("Provide valid value for config property '%s'.", e.getProperty()))
      .withConfigProperty(e.getProperty());
  } catch (InvalidStageException e) {
    if (e.getReasons().isEmpty()) {
      collector.addFailure(e.getMessage(), null);
    }
    for (InvalidStageException reason : e.getReasons()) {
      if (reason instanceof InvalidConfigPropertyException) {
        InvalidConfigPropertyException configException = (InvalidConfigPropertyException) reason;
        collector.addFailure(configException.getMessage(),
                             String.format("Provide valid value for config property '%s'.",
                                           configException.getProperty()))
          .withConfigProperty(configException.getProperty());
      } else {
        collector.addFailure(reason.getMessage(), null);
      }
    }
  } catch (ValidationException e) {
    throw e;
  } catch (NullPointerException e) {
    // the plugin threw a NullPointerException; avoid surfacing a bare 'null' as the error message
    collector.addFailure(String.format("Null error occurred while configuring the stage %s.", stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ArrayIndexOutOfBoundsException e) {
    // the plugin threw an ArrayIndexOutOfBoundsException; avoid surfacing a bare number
    // such as '2' or '8' as the error message
    collector.addFailure(String.format("Index out of bounds error occurred while configuring the stage %s.",
                                       stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ConnectionBadRequestException e) {
    collector.addFailure(e.getMessage(), "Provide a valid connection name.");
  } catch (Exception e) {
    collector.addFailure(String.format("Error encountered while configuring the stage: '%s'", e.getMessage()), null)
      .withStacktrace(e.getStackTrace());
  }
  // throw validation exception if there are any errors being carried by failure collector
  collector.getOrThrowException();
  PluginSpec pluginSpec = new PluginSpec(type, pluginName, etlPlugin.getProperties(),
                                         pluginSelector.getSelectedArtifact());
  StageSpec.Builder specBuilder = StageSpec.builder(stageName, pluginSpec)
    .addInputSchemas(pipelineConfigurer.getStageConfigurer().getInputSchemas())
    .setErrorSchema(stageConfigurer.getErrorSchema());
  if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
    specBuilder.setPortSchemas(stageConfigurer.getOutputPortSchemas());
  } else {
    specBuilder.setOutputSchema(stageConfigurer.getOutputSchema());
  }
  return specBuilder;
}
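
For context, here is a minimal sketch of the kind of plugin the instanceof AutoJoiner check above matches: a joiner that keeps the BatchJoiner plugin type but describes its join declaratively through define() instead of implementing the join itself. This sketch is not from the cdap sources; the plugin name, stage names, and fields are hypothetical, imports are omitted like the surrounding snippets, and the JoinDefinition/JoinStage/JoinCondition builder calls reflect the io.cdap.cdap.etl.api.join API as we understand it, so treat the exact method names as assumptions.

// Hypothetical AutoJoiner plugin sketch (not from the cdap repo); builder method names are assumptions.
@Plugin(type = BatchJoiner.PLUGIN_TYPE)
@Name("UserPurchaseJoin")
public class UserPurchaseJoin extends BatchAutoJoiner {

  @Override
  public JoinDefinition define(AutoJoinerContext context) {
    FailureCollector collector = context.getFailureCollector();
    Map<String, JoinStage> inputs = context.getInputStages();
    if (!inputs.containsKey("purchases") || !inputs.containsKey("users")) {
      collector.addFailure("Expected input stages 'purchases' and 'users'.", null);
      collector.getOrThrowException();
    }
    // declare the join instead of implementing joinOn/merge as a classic BatchJoiner would
    return JoinDefinition.builder()
      .select(new JoinField("purchases", "purchase_id"),
              new JoinField("purchases", "user_id"),
              new JoinField("users", "name"))
      .from(inputs.get("purchases"), inputs.get("users"))
      .on(JoinCondition.onKeys()
            .addKey(new JoinKey("purchases", Collections.singletonList("user_id")))
            .addKey(new JoinKey("users", Collections.singletonList("id")))
            .build())
      .build();
  }
}
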
Use of io.cdap.cdap.etl.api.join.AutoJoiner in project cdap by caskdata.
The class SparkPipelineRunner, method handleJoin.
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec, FunctionCache.Factory functionCacheFactory,
                                             Object plugin, Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  if (plugin instanceof BatchJoiner) {
    BatchJoiner<Object, Object, Object> joiner = (BatchJoiner<Object, Object, Object>) plugin;
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
  } else if (plugin instanceof AutoJoiner) {
    AutoJoiner autoJoiner = (AutoJoiner) plugin;
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    // joinDefinition will always be non-null because
    // it is checked by PipelinePhasePreparer at the start of the run.
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    if (joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast)) {
      shufflers.add(stageName);
    }
    return handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
  } else {
    // should never happen unless there is a bug in the code; it should have failed during deployment
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
}
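
To see what handleJoin feeds an AutoJoiner, here is a minimal sketch of exercising define() in isolation, for example from a unit test. Only DefaultAutoJoinerContext.from, LoggingFailureCollector, define, getOrThrowException, getStages, and isBroadcast come from the snippet above; the UserPurchaseJoin plugin is the hypothetical class sketched earlier, the schemas are invented, and Schema.recordOf/Schema.Field.of are the standard io.cdap.cdap.api.data.schema.Schema helpers. Imports are omitted like the surrounding snippets.

// Hedged sketch: driving an AutoJoiner the same way handleJoin does, outside a pipeline run.
Schema purchaseSchema = Schema.recordOf("purchase",
  Schema.Field.of("purchase_id", Schema.of(Schema.Type.LONG)),
  Schema.Field.of("user_id", Schema.of(Schema.Type.LONG)));
Schema userSchema = Schema.recordOf("user",
  Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
  Schema.Field.of("name", Schema.of(Schema.Type.STRING)));

Map<String, Schema> inputSchemas = new HashMap<>();
inputSchemas.put("purchases", purchaseSchema);
inputSchemas.put("users", userSchema);

FailureCollector failureCollector = new LoggingFailureCollector("joinStage", inputSchemas);
AutoJoinerContext context = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);

AutoJoiner autoJoiner = new UserPurchaseJoin();          // hypothetical plugin from the sketch above
JoinDefinition joinDefinition = autoJoiner.define(context);
failureCollector.getOrThrowException();                  // surfaces any failures define() collected

// same check handleJoin uses to decide whether the join forces a shuffle
boolean shuffles = joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast);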