Search in sources :

Example 1 with FailureCollector

use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.

the class PipelineSpecGenerator method configureStage.

/**
 * Configures a plugin and returns the spec for it.
 *
 * @param stageName the unique plugin id
 * @param etlPlugin user provided configuration for the plugin
 * @param pipelineConfigurer default pipeline configurer to configure the plugin
 * @return the spec for the plugin
 * @throws IllegalArgumentException if the plugin with same id is already deployed
 * @throws ValidationException if the plugin threw an exception during configuration
 */
public StageSpec.Builder configureStage(String stageName, ETLPlugin etlPlugin, DefaultPipelineConfigurer pipelineConfigurer) throws ValidationException {
    TrackedPluginSelector pluginSelector = new TrackedPluginSelector(new ArtifactSelectorProvider().getPluginSelector(etlPlugin.getArtifactConfig()));
    String type = etlPlugin.getType();
    String pluginName = etlPlugin.getName();
    DefaultStageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    FailureCollector collector = stageConfigurer.getFailureCollector();
    Object plugin = getPlugin(stageName, etlPlugin, pluginSelector, type, pluginName, collector);
    try {
        if (type.equals(BatchJoiner.PLUGIN_TYPE)) {
            MultiInputPipelineConfigurable multiPlugin = (MultiInputPipelineConfigurable) plugin;
            multiPlugin.configurePipeline(pipelineConfigurer);
            // to the BatchAutoJoiner while preserving backwards compatibility in the pipeline config.
            if (plugin instanceof AutoJoiner) {
                configureAutoJoiner(stageName, (AutoJoiner) plugin, stageConfigurer, collector);
            }
        } else if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
            MultiOutputPipelineConfigurable multiOutputPlugin = (MultiOutputPipelineConfigurable) plugin;
            multiOutputPlugin.configurePipeline(pipelineConfigurer);
        } else if (!type.equals(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
            PipelineConfigurable singlePlugin = (PipelineConfigurable) plugin;
            singlePlugin.configurePipeline(pipelineConfigurer);
            // evaluate macros and find out if there is connection used
            if ((sourcePluginTypes.contains(type) || BatchSink.PLUGIN_TYPE.equals(type)) && runtimeEvaluator == null) {
                pluginConfigurer.evaluateMacros(etlPlugin.getProperties(), connectionEvaluator, options);
            }
        }
    } catch (InvalidConfigPropertyException e) {
        collector.addFailure(e.getMessage(), String.format("Provide valid value for config property '%s'.", e.getProperty())).withConfigProperty(e.getProperty());
    } catch (InvalidStageException e) {
        if (e.getReasons().isEmpty()) {
            collector.addFailure(e.getMessage(), null);
        }
        for (InvalidStageException reason : e.getReasons()) {
            if (reason instanceof InvalidConfigPropertyException) {
                InvalidConfigPropertyException configException = (InvalidConfigPropertyException) reason;
                collector.addFailure(configException.getMessage(), String.format("Provide valid value for config property '%s'.", configException.getProperty())).withConfigProperty(configException.getProperty());
            } else {
                collector.addFailure(reason.getMessage(), null);
            }
        }
    } catch (ValidationException e) {
        throw e;
    } catch (NullPointerException e) {
        // handle the case where plugin throws null pointer exception, this is to avoid having 'null' as error message
        collector.addFailure(String.format("Null error occurred while configuring the stage %s.", stageName), null).withStacktrace(e.getStackTrace());
    } catch (ArrayIndexOutOfBoundsException e) {
        // handle the case where plugin throws index out of bounds exception,
        // this is to avoid having a number like '2', '8' etc as error message
        collector.addFailure(String.format("Index out of bounds error occurred while configuring the stage %s.", stageName), null).withStacktrace(e.getStackTrace());
    } catch (ConnectionBadRequestException e) {
        collector.addFailure(e.getMessage(), "Provide a valid connection name.");
    } catch (Exception e) {
        collector.addFailure(String.format("Error encountered while configuring the stage: '%s'", e.getMessage()), null).withStacktrace(e.getStackTrace());
    }
    // throw validation exception if there are any errors being carried by failure collector
    collector.getOrThrowException();
    PluginSpec pluginSpec = new PluginSpec(type, pluginName, etlPlugin.getProperties(), pluginSelector.getSelectedArtifact());
    StageSpec.Builder specBuilder = StageSpec.builder(stageName, pluginSpec).addInputSchemas(pipelineConfigurer.getStageConfigurer().getInputSchemas()).setErrorSchema(stageConfigurer.getErrorSchema());
    if (type.equals(SplitterTransform.PLUGIN_TYPE)) {
        specBuilder.setPortSchemas(stageConfigurer.getOutputPortSchemas());
    } else {
        specBuilder.setOutputSchema(stageConfigurer.getOutputSchema());
    }
    return specBuilder;
}
Also used : ArtifactSelectorProvider(io.cdap.cdap.etl.common.ArtifactSelectorProvider) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) InvalidStageException(io.cdap.cdap.etl.api.validation.InvalidStageException) MultiOutputPipelineConfigurable(io.cdap.cdap.etl.api.MultiOutputPipelineConfigurable) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) InvalidConfigPropertyException(io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException) InvalidPluginConfigException(io.cdap.cdap.api.plugin.InvalidPluginConfigException) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) ConnectionBadRequestException(io.cdap.cdap.etl.proto.connection.ConnectionBadRequestException) InvalidStageException(io.cdap.cdap.etl.api.validation.InvalidStageException) ConnectionBadRequestException(io.cdap.cdap.etl.proto.connection.ConnectionBadRequestException) PluginSpec(io.cdap.cdap.etl.proto.v2.spec.PluginSpec) MultiInputPipelineConfigurable(io.cdap.cdap.etl.api.MultiInputPipelineConfigurable) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) InvalidConfigPropertyException(io.cdap.cdap.etl.api.validation.InvalidConfigPropertyException) AutoJoiner(io.cdap.cdap.etl.api.join.AutoJoiner) MultiInputPipelineConfigurable(io.cdap.cdap.etl.api.MultiInputPipelineConfigurable) MultiOutputPipelineConfigurable(io.cdap.cdap.etl.api.MultiOutputPipelineConfigurable) PipelineConfigurable(io.cdap.cdap.etl.api.PipelineConfigurable) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 2 with FailureCollector

use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.

the class PipelinePhasePreparer method validateAutoJoiner.

private void validateAutoJoiner(AutoJoiner autoJoiner, StageSpec stageSpec) {
    // validate that the join definition is not null
    // it could be null at configure time due to macros not being evaluated, but at this
    // point all macros should be evaluated and the definition should be non-null.
    String stageName = stageSpec.getName();
    String pluginName = stageSpec.getPlugin().getName();
    FailureCollector failureCollector = new LoggingFailureCollector(stageSpec.getName(), stageSpec.getInputSchemas());
    AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
    JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
    failureCollector.getOrThrowException();
    if (joinDefinition == null) {
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not provide a join definition. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName));
    }
    // validate that the stages mentioned in the join definition are actually inputs into the joiner.
    Set<String> inputStages = stageSpec.getInputSchemas().keySet();
    Set<String> joinStages = joinDefinition.getStages().stream().map(JoinStage::getStageName).collect(Collectors.toSet());
    Set<String> missingInputs = Sets.difference(inputStages, joinStages);
    if (!missingInputs.isEmpty()) {
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' did not include input stage %s in the join. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", missingInputs)));
    }
    Set<String> extraInputs = Sets.difference(joinStages, inputStages);
    if (!extraInputs.isEmpty()) {
        throw new IllegalArgumentException(String.format("Joiner stage '%s' using plugin '%s' is trying to join stage %s, which is not an input. " + "Check with the plugin developer to make sure it is implemented correctly.", stageName, pluginName, String.join(", ", missingInputs)));
    }
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 3 with FailureCollector

use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.

the class StringValueFilterTransform method configurePipeline.

@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
    StageConfigurer stageConfigurer = pipelineConfigurer.getStageConfigurer();
    Schema inputSchema = stageConfigurer.getInputSchema();
    FailureCollector collector = stageConfigurer.getFailureCollector();
    if (inputSchema != null && !config.containsMacro("field")) {
        Schema.Field field = inputSchema.getField(config.field);
        if (field == null) {
            collector.addFailure(String.format("'%s' is not a field in the input schema.", config.field), "Make sure field is present in the input schema.").withConfigProperty("field");
            collector.getOrThrowException();
        }
        Schema fieldSchema = field.getSchema();
        Schema.Type fieldType = fieldSchema.isNullable() ? fieldSchema.getNonNullable().getType() : fieldSchema.getType();
        if (fieldType != Schema.Type.STRING) {
            collector.addFailure(String.format("'%s' is of type '%s' instead of a string.", config.field, fieldType), "Make sure provided field is of type string.").withConfigProperty("field").withInputSchemaField(config.field);
        }
    }
    stageConfigurer.setOutputSchema(stageConfigurer.getInputSchema());
}
Also used : StageConfigurer(io.cdap.cdap.etl.api.StageConfigurer) Schema(io.cdap.cdap.api.data.schema.Schema) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 4 with FailureCollector

use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.

the class SparkPipelineRunner method handleJoin.

protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
    String stageName = stageSpec.getName();
    if (plugin instanceof BatchJoiner) {
        BatchJoiner<Object, Object, Object> joiner = (BatchJoiner<Object, Object, Object>) plugin;
        BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
        joiner.initialize(joinerRuntimeContext);
        shufflers.add(stageName);
        return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
    } else if (plugin instanceof AutoJoiner) {
        AutoJoiner autoJoiner = (AutoJoiner) plugin;
        Map<String, Schema> inputSchemas = new HashMap<>();
        for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
            StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
            inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
        }
        FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
        AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
        // joinDefinition will always be non-null because
        // it is checked by PipelinePhasePreparer at the start of the run.
        JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
        failureCollector.getOrThrowException();
        if (joinDefinition.getStages().stream().noneMatch(JoinStage::isBroadcast)) {
            shufflers.add(stageName);
        }
        return handleAutoJoin(stageName, joinDefinition, inputDataCollections, numPartitions);
    } else {
        // should never happen unless there is a bug in the code. should have failed during deployment
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) AutoJoiner(io.cdap.cdap.etl.api.join.AutoJoiner) Map(java.util.Map) HashMap(java.util.HashMap) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 5 with FailureCollector

use of io.cdap.cdap.etl.api.FailureCollector in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getTransformation.

@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    String pluginType = stageSpec.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
    StageStatisticsCollector collector = collectStageStatistics ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext) : new NoopStageStatisticsCollector();
    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchAggregator<?, ?, ?> batchAggregator;
        if (plugin instanceof BatchReducibleAggregator) {
            BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
            batchAggregator = new AggregatorBridge<>(reducibleAggregator);
        } else {
            batchAggregator = (BatchAggregator<?, ?, ?>) plugin;
        }
        BatchRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
        batchAggregator.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        } else {
            return getTrackedAggregateStep(new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        }
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchJoiner<?, ?, ?> batchJoiner;
        Set<String> filterNullKeyStages = new HashSet<>();
        if (plugin instanceof BatchAutoJoiner) {
            BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
            FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
            DefaultAutoJoinerContext context = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
            // definition will be non-null due to validate by PipelinePhasePreparer at the start of the run
            JoinDefinition joinDefinition = autoJoiner.define(context);
            JoinCondition condition = joinDefinition.getCondition();
            // should never happen as it's checked at deployment time, but add this to be safe.
            if (condition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
                failureCollector.addFailure(String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.", stageName, condition.getOp()), "Switch to a different execution engine.");
            }
            failureCollector.getOrThrowException();
            batchJoiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
            // this is the same as filtering out records that have a null key if they are from an optional stage
            if (condition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) condition).isNullSafe()) {
                filterNullKeyStages = joinDefinition.getStages().stream().filter(s -> !s.isRequired()).map(JoinStage::getStageName).collect(Collectors.toSet());
            }
        } else {
            batchJoiner = (BatchJoiner<?, ?, ?>) plugin;
        }
        BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
        batchJoiner.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, filterNullKeyStages), stageMetrics, getDataTracer(stageName), collector);
        } else {
            return getTrackedMergeStep(new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, runtimeContext.getInputSchemas().size()), stageMetrics, getDataTracer(stageName), collector);
        }
    }
    return super.getTransformation(stageSpec);
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) Set(java.util.Set) HashSet(java.util.HashSet) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchRuntimeContext(io.cdap.cdap.etl.api.batch.BatchRuntimeContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Aggregations

FailureCollector (io.cdap.cdap.etl.api.FailureCollector)7 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)4 DefaultAutoJoinerContext (io.cdap.cdap.etl.common.DefaultAutoJoinerContext)4 LoggingFailureCollector (io.cdap.cdap.etl.validation.LoggingFailureCollector)4 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)3 BatchJoinerRuntimeContext (io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext)3 AutoJoinerContext (io.cdap.cdap.etl.api.join.AutoJoinerContext)3 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)2 AutoJoiner (io.cdap.cdap.etl.api.join.AutoJoiner)2 JoinerBridge (io.cdap.cdap.etl.common.plugin.JoinerBridge)2 HashMap (java.util.HashMap)2 InvalidPluginConfigException (io.cdap.cdap.api.plugin.InvalidPluginConfigException)1 MultiInputPipelineConfigurable (io.cdap.cdap.etl.api.MultiInputPipelineConfigurable)1 MultiOutputPipelineConfigurable (io.cdap.cdap.etl.api.MultiOutputPipelineConfigurable)1 PipelineConfigurable (io.cdap.cdap.etl.api.PipelineConfigurable)1 StageConfigurer (io.cdap.cdap.etl.api.StageConfigurer)1 StageMetrics (io.cdap.cdap.etl.api.StageMetrics)1 BatchReducibleAggregator (io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)1