Use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.
The class PipelineSpecGenerator, method configureStage.
/**
 * Configures a plugin and returns the spec for it.
 *
 * @param stageName the unique plugin id
 * @param etlPlugin user-provided configuration for the plugin
 * @param pipelineConfigurer default pipeline configurer to configure the plugin
 * @return the spec for the plugin
 * @throws IllegalArgumentException if a plugin with the same id is already deployed
 * @throws ValidationException if the plugin threw an exception during configuration
 */
public StageSpec.Builder configureStage(String stageName, ETLPlugin etlPlugin,
                                        DefaultPipelineConfigurer pipelineConfigurer) throws ValidationException {
  TrackedPluginSelector pluginSelector =
    new TrackedPluginSelector(new ArtifactSelectorProvider().getPluginSelector(etlPlugin.getArtifactConfig()));
  String type = etlPlugin.getType();
  String pluginName = etlPlugin.getName();
  DefaultStageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
  FailureCollector collector = stageConfigurer.getFailureCollector();
  Object plugin = getPlugin(stageName, etlPlugin, pluginSelector, type, pluginName, collector);
  try {
    if (type.equals(BatchJoiner.PLUGIN_TYPE)) {
      MultiInputPipelineConfigurable multiPlugin = (MultiInputPipelineConfigurable) plugin;
      multiPlugin.configurePipeline(pipelineConfigurer);
      // Auto joiners delegate their schema configuration to the BatchAutoJoiner
      // while preserving backwards compatibility in the pipeline config.
      if (plugin instanceof AutoJoiner) {
        configureAutoJoiner(stageName, (AutoJoiner) plugin, stageConfigurer, collector);
      }
    } else if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
      MultiOutputPipelineConfigurable multiOutputPlugin = (MultiOutputPipelineConfigurable) plugin;
      multiOutputPlugin.configurePipeline(pipelineConfigurer);
    } else if (!type.equals(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
      PipelineConfigurable singlePlugin = (PipelineConfigurable) plugin;
      singlePlugin.configurePipeline(pipelineConfigurer);
      // evaluate macros and find out whether a connection is used
      if ((sourcePluginTypes.contains(type) || BatchSink.PLUGIN_TYPE.equals(type)) && runtimeEvaluator == null) {
        pluginConfigurer.evaluateMacros(etlPlugin.getProperties(), connectionEvaluator, options);
      }
    }
  } catch (InvalidConfigPropertyException e) {
    collector.addFailure(e.getMessage(),
                         String.format("Provide valid value for config property '%s'.", e.getProperty()))
      .withConfigProperty(e.getProperty());
  } catch (InvalidStageException e) {
    if (e.getReasons().isEmpty()) {
      collector.addFailure(e.getMessage(), null);
    }
    for (InvalidStageException reason : e.getReasons()) {
      if (reason instanceof InvalidConfigPropertyException) {
        InvalidConfigPropertyException configException = (InvalidConfigPropertyException) reason;
        collector.addFailure(configException.getMessage(),
                             String.format("Provide valid value for config property '%s'.",
                                           configException.getProperty()))
          .withConfigProperty(configException.getProperty());
      } else {
        collector.addFailure(reason.getMessage(), null);
      }
    }
  } catch (ValidationException e) {
    throw e;
  } catch (NullPointerException e) {
    // Handle the case where the plugin throws a NullPointerException,
    // to avoid surfacing 'null' as the error message.
    collector.addFailure(String.format("Null error occurred while configuring the stage %s.", stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ArrayIndexOutOfBoundsException e) {
    // Handle the case where the plugin throws an ArrayIndexOutOfBoundsException,
    // to avoid surfacing a bare number like '2' or '8' as the error message.
    collector.addFailure(String.format("Index out of bounds error occurred while configuring the stage %s.",
                                       stageName), null)
      .withStacktrace(e.getStackTrace());
  } catch (ConnectionBadRequestException e) {
    collector.addFailure(e.getMessage(), "Provide a valid connection name.");
  } catch (Exception e) {
    collector.addFailure(String.format("Error encountered while configuring the stage: '%s'", e.getMessage()), null)
      .withStacktrace(e.getStackTrace());
  }
  // Throw a ValidationException if the failure collector is carrying any errors.
  collector.getOrThrowException();
  PluginSpec pluginSpec = new PluginSpec(type, pluginName, etlPlugin.getProperties(),
                                         pluginSelector.getSelectedArtifact());
  StageSpec.Builder specBuilder = StageSpec.builder(stageName, pluginSpec)
    .addInputSchemas(pipelineConfigurer.getStageConfigurer().getInputSchemas())
    .setErrorSchema(stageConfigurer.getErrorSchema());
  if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
    specBuilder.setPortSchemas(stageConfigurer.getOutputPortSchemas());
  } else {
    specBuilder.setOutputSchema(stageConfigurer.getOutputSchema());
  }
  return specBuilder;
}
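When getOrThrowException() fires, the resulting ValidationException carries every failure added to the collector. A minimal caller-side sketch of surfacing them, assuming the getFailures() and getMessage() accessors of the io.cdap.cdap.etl.api.validation classes, and with 'specGenerator' as a hypothetical PipelineSpecGenerator instance:

import io.cdap.cdap.etl.api.validation.ValidationException;
import io.cdap.cdap.etl.api.validation.ValidationFailure;

try {
  // configureStage(...) above calls collector.getOrThrowException() before returning
  StageSpec.Builder builder = specGenerator.configureStage(stageName, etlPlugin, pipelineConfigurer);
} catch (ValidationException e) {
  // Each ValidationFailure corresponds to one addFailure(...) call on the collector.
  for (ValidationFailure failure : e.getFailures()) {
    System.err.println("Stage validation failed: " + failure.getMessage());
  }
}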
Use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.
The class PipelinePhasePreparer, method validateAutoJoiner.
private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
  // Validate that the join definition is not null.
  // It could be null at configure time due to macros not being evaluated, but at this
  // point all macros should be evaluated and the definition should be non-null.
  String stageName = stageSpec.getName();
  String pluginName = stageSpec.getPlugin().getName();
  FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
  AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
  JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
  failureCollector.getOrThrowException();
  if (joinDefinition == null) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not provide a join definition. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName));
  }
  // Validate that the stages mentioned in the join definition are actually inputs into the joiner.
  Set<String> inputStages = stageSpec.getInputSchemas().keySet();
  Set<String> joinStages = joinDefinition.getStages().stream()
    .map(JoinStage::getStageName)
    .collect(Collectors.toSet());
  Set<String> missingInputs = Sets.difference(inputStages, joinStages);
  if (!missingInputs.isEmpty()) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", missingInputs)));
  }
  Set<String> extraInputs = Sets.difference(joinStages, inputStages);
  if (!extraInputs.isEmpty()) {
    throw new IllegalArgumentException(String.format(
      "Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. "
        + "Check with the plugin developer to make sure it is implemented correctly.",
      stageName, pluginName, String.join(", ", extraInputs)));
  }
}
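The direction of the Sets.difference arguments determines which mismatch gets reported, which is exactly the distinction between the two checks above. A small standalone illustration using Guava, with hypothetical stage names:

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import java.util.Set;

Set<String> inputStages = ImmutableSet.of("users", "purchases");
Set<String> joinStages = ImmutableSet.of("users", "regions");
// Elements of the first set that are absent from the second (returned as a view):
Sets.difference(inputStages, joinStages); // [purchases]: an input the join definition omitted
Sets.difference(joinStages, inputStages); // [regions]: a joined stage that is not an input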
Use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.
The class StringValueFilterTransform, method configurePipeline.
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
  Schema inputSchema = stageConfigurer.getInputSchema();
  FailureCollector collector = stageConfigurer.getFailureCollector();
  if (inputSchema != null && !config.containsMacro("field")) {
    Schema.Field field = inputSchema.getField(config.field);
    if (field == null) {
      collector.addFailure(String.format("'%s' is not a field in the input schema.", config.field),
                           "Make sure field is present in the input schema.")
        .withConfigProperty("field");
      collector.getOrThrowException();
    }
    Schema fieldSchema = field.getSchema();
    Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
    if (fieldType != Schema.Type.STRING) {
      collector.addFailure(String.format("'%s' is of type '%s' instead of a string.", config.field, fieldType),
                           "Make sure provided field is of type string.")
        .withConfigProperty("field")
        .withInputSchemaField(config.field);
    }
  }
  stageConfigurer.setOutputSchema(stageConfigurer.getInputSchema());
}
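The nullable-unwrap ternary above is the standard idiom for getting at the underlying type of a possibly-nullable CDAP schema. An illustrative sketch using the io.cdap.cdap.api.data.schema.Schema factory methods:

// Build a nullable string schema, then unwrap it the same way the transform does.
Schema nullableString = Schema.nullableOf(Schema.of(Schema.Type.STRING));
Schema.Type unwrapped = nullableString.isNullable()
  ? nullableString.getNonNullable().getType() // STRING
  : nullableString.getType();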
Use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.
The class SparkPipelineRunner, method handleJoin.
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase,
                                             PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec,
                                             FunctionCache.Factory functionCacheFactory,
                                             Object plugin,
                                             Integer numPartitions,
                                             StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
  String stageName = stageSpec.getName();
  if (plugin instanceof BatchJoiner) {
    BatchJoiner<Object, Object, Object> joiner = (BatchJoiner<Object, Object, Object>) plugin;
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
  } else if (plugin instanceof AutoJoiner) {
    AutoJoiner autoJoiner = (AutoJoiner) plugin;
    Map<String, Schema> inputSchemas = new HashMap<>();
    for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
      StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
      inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
    }
    FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
    // joinDefinition will always be non-null because
    // it is checked by PipelinePhasePreparer at the start of the run.
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    if (joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast)) {
      shufflers.add(stageName);
    }
    return handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
  } else {
    // Should never happen unless there is a bug in the code; it should have failed during deployment.
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s",
                                                  stageName, plugin.getClass().getName()));
  }
}
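The define-then-drain sequence in the AutoJoiner branch is the same runtime validation handshake seen in PipelinePhasePreparer above and in the MapReduce factory below: wrap the input schemas in a LoggingFailureCollector, hand it to the plugin through a DefaultAutoJoinerContext, and call getOrThrowException() before trusting the definition. Condensed, with stand-in values for the schema map and plugin instance:

Map<String, Schema> inputSchemas = Collections.singletonMap("input", inputSchema); // stand-ins
FailureCollector failureCollector = new LoggingFailureCollector("myJoiner", inputSchemas);
AutoJoinerContext ctx = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
JoinDefinition joinDefinition = autoJoiner.define(ctx);
failureCollector.getOrThrowException(); // surfaces any failures define() registered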
Use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.
The class MapReduceTransformExecutorFactory, method getTransformation.
@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
  String stageName = stageSpec.getName();
  String pluginType = stageSpec.getPluginType();
  StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
  TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
  StageStatisticsCollector collector = collectStageStatistics
    ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext)
    : new NoopStageStatisticsCollector();
  if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchAggregator<?, ?, ?> batchAggregator;
    if (plugin instanceof BatchReducibleAggregator) {
      BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
      batchAggregator = new AggregatorBridge<>(reducibleAggregator);
    } else {
      batchAggregator = (BatchAggregator<?, ?, ?>) plugin;
    }
    BatchRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
    batchAggregator.initialize(runtimeContext);
    if (isMapPhase) {
      return getTrackedEmitKeyStep(
        new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
        stageMetrics, getDataTracer(stageName), collector);
    } else {
      return getTrackedAggregateStep(
        new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName),
        stageMetrics, getDataTracer(stageName), collector);
    }
  } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchJoiner<?, ?, ?> batchJoiner;
    Set<String> filterNullKeyStages = new HashSet<>();
    if (plugin instanceof BatchAutoJoiner) {
      BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
      FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
      DefaultAutoJoinerContext context = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
      // The definition will be non-null due to validation by PipelinePhasePreparer at the start of the run.
      JoinDefinition joinDefinition = autoJoiner.define(context);
      JoinCondition condition = joinDefinition.getCondition();
      // Should never happen as it's checked at deployment time, but add this to be safe.
      if (condition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
        failureCollector.addFailure(
          String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.",
                        stageName, condition.getOp()),
          "Switch to a different execution engine.");
      }
      failureCollector.getOrThrowException();
      batchJoiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
      // This is the same as filtering out records that have a null key if they are from an optional stage.
      if (condition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) condition).isNullSafe()) {
        filterNullKeyStages = joinDefinition.getStages().stream()
          .filter(s -> !s.isRequired())
          .map(JoinStage::getStageName)
          .collect(Collectors.toSet());
      }
    } else {
      batchJoiner = (BatchJoiner<?, ?, ?>) plugin;
    }
    BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
    batchJoiner.initialize(runtimeContext);
    if (isMapPhase) {
      return getTrackedEmitKeyStep(
        new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName,
                                       filterNullKeyStages),
        stageMetrics, getDataTracer(stageName), collector);
    } else {
      return getTrackedMergeStep(
        new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName,
                                        runtimeContext.getInputSchemas().size()),
        stageMetrics, getDataTracer(stageName), collector);
    }
  }
  return super.getTransformation(stageSpec);
}