Search in sources :

Example 1 with BatchAutoJoiner

use of io.cdap.cdap.etl.api.batch.BatchAutoJoiner in project cdap by caskdata.

the class PipelinePhasePreparer method prepare.

/**
 * Prepares every stage of the given phase and collects the finishers to run once the pipeline completes.
 *
 * <p>Stages are visited in the topological order of the phase DAG. For each stage a plugin instance is created
 * and wrapped in a {@code SubmitterPlugin} selected by the stage's plugin type. Whenever a non-null submitter is
 * produced, its {@code prepareRun()} is invoked right away and the submitter is appended to the returned list.
 *
 * @param phaseSpec the pipeline phase to prepare
 * @return list of finishers that should be run when the pipeline ends
 * @throws IllegalStateException if the plugin of an aggregator or join stage is not of a supported class
 */
public List<Finisher> prepare(PhaseSpec phaseSpec) throws TransactionFailureException, InstantiationException, IOException {
    PipelinePluginInstantiator instantiator = getPluginInstantiator(phaseSpec);
    PipelinePhase pipelinePhase = phaseSpec.getPhase();
    List<Finisher> result = new ArrayList<>();
    // Topological order guarantees that arguments set by one stage's prepareRun are visible to later stages.
    for (String name : pipelinePhase.getDag().getTopologicalOrder()) {
        StageSpec spec = pipelinePhase.getStage(name);
        String type = spec.getPluginType();
        // A connector stage acts as a source or a sink depending on where it sits in this phase.
        boolean connector = Constants.Connector.PLUGIN_TYPE.equals(type);
        boolean connectorSource = connector && pipelinePhase.getSources().contains(name);
        boolean connectorSink = connector && pipelinePhase.getSinks().contains(name);

        SubmitterPlugin submitter;
        if (BatchSource.PLUGIN_TYPE.equals(type) || connectorSource) {
            BatchConfigurable<BatchSourceContext> source = instantiator.newPluginInstance(name, macroEvaluator);
            submitter = createSource(source, spec);
        } else if (BatchSink.PLUGIN_TYPE.equals(type) || AlertPublisher.PLUGIN_TYPE.equals(type) || connectorSink) {
            BatchConfigurable<BatchSinkContext> sink = instantiator.newPluginInstance(name, macroEvaluator);
            submitter = createSink(sink, spec);
        } else if (Transform.PLUGIN_TYPE.equals(type) || ErrorTransform.PLUGIN_TYPE.equals(type)) {
            Transform<?, ?> transformPlugin = instantiator.newPluginInstance(name, macroEvaluator);
            submitter = createTransform(transformPlugin, spec);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(type)) {
            // The aggregator plugin type covers two implementation classes; dispatch on the runtime class.
            Object plugin = instantiator.newPluginInstance(name, macroEvaluator);
            if (plugin instanceof BatchAggregator) {
                submitter = createAggregator((BatchAggregator<?, ?, ?>) plugin, spec);
            } else if (plugin instanceof BatchReducibleAggregator) {
                submitter = createReducibleAggregator((BatchReducibleAggregator<?, ?, ?, ?>) plugin, spec);
            } else {
                throw new IllegalStateException(String.format("Aggregator stage '%s' is of an unsupported class '%s'.", spec.getName(), plugin.getClass().getName()));
            }
        } else if (BatchJoiner.PLUGIN_TYPE.equals(type)) {
            // Likewise, the joiner plugin type covers both explicit joiners and auto-joiners.
            Object plugin = instantiator.newPluginInstance(name, macroEvaluator);
            if (plugin instanceof BatchJoiner) {
                submitter = createJoiner((BatchJoiner<?, ?, ?>) plugin, spec);
            } else if (plugin instanceof BatchAutoJoiner) {
                BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
                // Auto-joiners are validated before their submitter is created.
                validateAutoJoiner(autoJoiner, spec);
                submitter = createAutoJoiner(autoJoiner, spec);
            } else {
                throw new IllegalStateException(String.format("Join stage '%s' is of an unsupported class '%s'.", spec.getName(), plugin.getClass().getName()));
            }
        } else if (SplitterTransform.PLUGIN_TYPE.equals(type)) {
            SplitterTransform<?, ?> splitter = instantiator.newPluginInstance(name, macroEvaluator);
            submitter = createSplitterTransform(splitter, spec);
        } else {
            submitter = create(instantiator, spec);
        }

        // A null submitter means there is nothing to prepare or finish for this stage.
        if (submitter == null) {
            continue;
        }
        submitter.prepareRun();
        result.add(submitter);
    }
    return result;
}
Also used : BatchSourceContext(io.cdap.cdap.etl.api.batch.BatchSourceContext) ArrayList(java.util.ArrayList) SplitterTransform(io.cdap.cdap.etl.api.SplitterTransform) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) BatchAggregator(io.cdap.cdap.etl.api.batch.BatchAggregator) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) BatchConfigurable(io.cdap.cdap.etl.api.batch.BatchConfigurable) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)

Example 2 with BatchAutoJoiner

use of io.cdap.cdap.etl.api.batch.BatchAutoJoiner in project cdap by caskdata.

the class JoinMergeFunction method createInitializedJoiner.

/**
 * Creates the plugin for this stage and returns it as a ready-to-use {@code BatchJoiner}.
 *
 * <p>A plain joiner is cast and initialized with a batch runtime context. A {@code BatchAutoJoiner} is asked for
 * its join definition and then wrapped in a {@code JoinerBridge}; the bridge is returned without an
 * {@code initialize} call.
 *
 * @return the joiner for this stage
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 * @throws Exception if plugin creation, join definition, validation or initialization fails
 */
private <K, V, O> BatchJoiner<K, V, O> createInitializedJoiner() throws Exception {
    Object plugin = pluginFunctionContext.createPlugin();

    if (!(plugin instanceof BatchAutoJoiner)) {
        BatchJoiner<K, V, O> plainJoiner = (BatchJoiner<K, V, O>) plugin;
        BatchJoinerRuntimeContext runtimeContext = pluginFunctionContext.createBatchRuntimeContext();
        plainJoiner.initialize(runtimeContext);
        return plainJoiner;
    }

    String stageName = pluginFunctionContext.getStageName();
    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    AutoJoinerContext definitionContext = pluginFunctionContext.createAutoJoinerContext();
    JoinDefinition definition = autoJoiner.define(definitionContext);
    // Surface any failures the plugin recorded while defining the join before using the definition.
    definitionContext.getFailureCollector().getOrThrowException();
    if (definition == null) {
        throw new IllegalStateException(String.format("Join stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }
    BatchJoiner<K, V, O> bridge = new JoinerBridge(stageName, autoJoiner, definition);
    return bridge;
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)

Example 3 with BatchAutoJoiner

use of io.cdap.cdap.etl.api.batch.BatchAutoJoiner in project cdap by caskdata.

the class JoinOnFunction method createInitializedJoinOnTransform.

/**
 * Creates the plugin for this stage and wraps it in a {@code JoinOnTransform} for the current input stage.
 *
 * <p>A plain joiner is cast, initialized with a batch runtime context, and wrapped with null-key filtering
 * disabled. A {@code BatchAutoJoiner} is asked for its join definition, wrapped in a {@code JoinerBridge}, and
 * null-key filtering is enabled only when the join is a non-null-safe key equality join and the current input
 * stage is one of the definition's non-required stages.
 *
 * @return the transform that applies the join-on step for {@code inputStageName}
 * @throws IllegalStateException if an auto-joiner returns a null join definition
 * @throws Exception if plugin creation, join definition, validation or initialization fails
 */
private JoinOnTransform<INPUT_RECORD, JOIN_KEY> createInitializedJoinOnTransform() throws Exception {
    Object plugin = pluginFunctionContext.createPlugin();

    if (!(plugin instanceof BatchAutoJoiner)) {
        BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> plainJoiner = (BatchJoiner<JOIN_KEY, INPUT_RECORD, Object>) plugin;
        BatchJoinerRuntimeContext runtimeContext = pluginFunctionContext.createBatchRuntimeContext();
        plainJoiner.initialize(runtimeContext);
        return new JoinOnTransform<>(plainJoiner, inputStageName, false);
    }

    BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
    AutoJoinerContext definitionContext = pluginFunctionContext.createAutoJoinerContext();
    JoinDefinition definition = autoJoiner.define(definitionContext);
    // Surface any failures the plugin recorded while defining the join before using the definition.
    definitionContext.getFailureCollector().getOrThrowException();
    String stageName = pluginFunctionContext.getStageName();
    if (definition == null) {
        throw new IllegalStateException(String.format("Join stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
    }

    /*
     Records from an optional (non-required) stage are filtered out when their key is null, or when any
     field of the key is null. For example, consider a left outer join on:

       A (id, name) = (0, alice), (null, bob)
       B (id, email) = (0, alice@example.com), (null, placeholder@example.com)

     The expected output is:

       joined (A.id, A.name, B.email) = (0, alice, alice@example.com), (null, bob, null)

     In other words, the bob record must not be joined to the placeholder@example.com email, even though
     both ids are null.
     */
    JoinCondition joinCondition = definition.getCondition();
    boolean filterNullKeys = false;
    if (joinCondition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) joinCondition).isNullSafe()) {
        for (JoinStage stage : definition.getStages()) {
            if (!stage.isRequired() && stage.getStageName().equals(inputStageName)) {
                filterNullKeys = true;
                break;
            }
        }
    }

    BatchJoiner<JOIN_KEY, INPUT_RECORD, Object> bridge = new JoinerBridge(stageName, autoJoiner, definition);
    return new JoinOnTransform<>(bridge, inputStageName, filterNullKeys);
}
Also used : BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) Transformation(io.cdap.cdap.etl.api.Transformation) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Iterator(java.util.Iterator) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) Tuple2(scala.Tuple2) Schema(io.cdap.cdap.api.data.schema.Schema) Constants(io.cdap.cdap.etl.common.Constants) StructuredRecord(io.cdap.cdap.api.data.format.StructuredRecord) TrackedTransform(io.cdap.cdap.etl.common.TrackedTransform) Emitter(io.cdap.cdap.etl.api.Emitter) DefaultEmitter(io.cdap.cdap.etl.common.DefaultEmitter) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) JoinStage(io.cdap.cdap.etl.api.join.JoinStage) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)

Example 4 with BatchAutoJoiner

use of io.cdap.cdap.etl.api.batch.BatchAutoJoiner in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getTransformation.

/**
 * Builds the tracked transformation for a stage, handling aggregator and joiner stages specially.
 *
 * <p>Aggregator and joiner plugins are instantiated and initialized here, then wrapped in the mapper-side
 * transformation when {@code isMapPhase} is set and in the reducer-side transformation otherwise. Every other
 * plugin type is delegated to the superclass.
 *
 * @param stageSpec specification of the stage to build a transformation for
 * @return the tracked transformation for the stage
 * @throws Exception if plugin instantiation, join definition, validation or initialization fails
 */
@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    String pluginType = stageSpec.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    TaskAttemptContext hadoopContext = (TaskAttemptContext) taskContext.getHadoopContext();
    StageStatisticsCollector collector;
    if (collectStageStatistics) {
        collector = new MapReduceStageStatisticsCollector(stageName, hadoopContext);
    } else {
        collector = new NoopStageStatisticsCollector();
    }

    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchAggregator<?, ?, ?> aggregator;
        if (!(plugin instanceof BatchReducibleAggregator)) {
            aggregator = (BatchAggregator<?, ?, ?>) plugin;
        } else {
            // Reducible aggregators are adapted to the plain aggregator interface through a bridge.
            BatchReducibleAggregator<?, ?, ?, ?> reducible = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
            aggregator = new AggregatorBridge<>(reducible);
        }
        BatchRuntimeContext aggregatorContext = createRuntimeContext(stageSpec);
        aggregator.initialize(aggregatorContext);
        if (!isMapPhase) {
            return getTrackedAggregateStep(new ReducerAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        }
        return getTrackedEmitKeyStep(new MapperAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
    }

    if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchJoiner<?, ?, ?> joiner;
        Set<String> nullKeyFilteredStages = new HashSet<>();
        if (!(plugin instanceof BatchAutoJoiner)) {
            joiner = (BatchJoiner<?, ?, ?>) plugin;
        } else {
            BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
            FailureCollector failures = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
            DefaultAutoJoinerContext definitionContext = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failures);
            // The definition is expected to be non-null here: PipelinePhasePreparer validates it at run start.
            JoinDefinition definition = autoJoiner.define(definitionContext);
            JoinCondition joinCondition = definition.getCondition();
            // Checked at deployment time already, so this should never trigger; kept as a safety net.
            if (joinCondition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
                failures.addFailure(String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.", stageName, joinCondition.getOp()), "Switch to a different execution engine.");
            }
            failures.getOrThrowException();
            joiner = new JoinerBridge(stageName, autoJoiner, definition);
            // Equivalent to dropping records with a null key when they come from an optional stage.
            if (joinCondition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) joinCondition).isNullSafe()) {
                nullKeyFilteredStages = definition.getStages().stream()
                    .filter(stage -> !stage.isRequired())
                    .map(JoinStage::getStageName)
                    .collect(Collectors.toSet());
            }
        }
        BatchJoinerRuntimeContext joinerContext = createRuntimeContext(stageSpec);
        joiner.initialize(joinerContext);
        if (!isMapPhase) {
            return getTrackedMergeStep(new ReducerJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName, joinerContext.getInputSchemas().size()), stageMetrics, getDataTracer(stageName), collector);
        }
        return getTrackedEmitKeyStep(new MapperJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName, nullKeyFilteredStages), stageMetrics, getDataTracer(stageName), collector);
    }

    return super.getTransformation(stageSpec);
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) Set(java.util.Set) HashSet(java.util.HashSet) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchRuntimeContext(io.cdap.cdap.etl.api.batch.BatchRuntimeContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 5 with BatchAutoJoiner

use of io.cdap.cdap.etl.api.batch.BatchAutoJoiner in project cdap by caskdata.

the class SparkStreamingPipelineRunner method handleJoin.

/**
 * Resolves the joiner plugin for a join stage, initializes it, and delegates to the joiner-based
 * {@code handleJoin} overload.
 *
 * <p>For a {@code BatchAutoJoiner}, the input schemas are gathered from the output schemas of the stage's
 * inputs, the plugin is asked for its join definition, and the result is wrapped in a {@code JoinerBridge}.
 * Failures recorded by the plugin while defining the join are raised right after {@code define} returns, so an
 * invalid definition is never used. The stage name is added to {@code shufflers} before delegating.
 *
 * @param inputDataCollections input collections, passed through to the delegate
 * @param pipelinePhase phase used to look up the stage's inputs and their specifications
 * @param pluginFunctionContext context used to create the joiner runtime context
 * @param stageSpec specification of the join stage
 * @param functionCacheFactory passed through to the delegate
 * @param plugin the instantiated join plugin, either a {@code BatchAutoJoiner} or a {@code BatchJoiner}
 * @param numPartitions passed through to the delegate
 * @param collector passed through to the delegate
 * @param shufflers set that receives the name of this stage
 * @return the collection produced by the delegate join
 * @throws IllegalStateException if the plugin is not a known joiner type, or if an auto-joiner returns a null
 *     join definition
 * @throws Exception if join definition, validation or initialization fails
 */
@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
    String stageName = stageSpec.getName();
    BatchJoiner<?, ?, ?> joiner;
    if (plugin instanceof BatchAutoJoiner) {
        BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
        // The auto-joiner defines its join from the output schema of each stage feeding into it.
        Map<String, Schema> inputSchemas = new HashMap<>();
        for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
            StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
            inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
        }
        FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
        AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
        JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
        // Must run after define(): the plugin reports its validation failures to the collector during define(),
        // so checking earlier would inspect an empty collector and let an invalid definition through.
        failureCollector.getOrThrowException();
        if (joinDefinition == null) {
            throw new IllegalStateException(String.format("Joiner stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
        }
        joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
    } else if (plugin instanceof BatchJoiner) {
        joiner = (BatchJoiner<?, ?, ?>) plugin;
    } else {
        // should never happen unless there is a bug in the code. should have failed during deployment
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)

Aggregations

BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)5 BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)4 BatchJoinerRuntimeContext (io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext)4 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)4 JoinerBridge (io.cdap.cdap.etl.common.plugin.JoinerBridge)4 AutoJoinerContext (io.cdap.cdap.etl.api.join.AutoJoinerContext)3 Schema (io.cdap.cdap.api.data.schema.Schema)2 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)2 BatchReducibleAggregator (io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)2 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)2 DefaultAutoJoinerContext (io.cdap.cdap.etl.common.DefaultAutoJoinerContext)2 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)2 LoggingFailureCollector (io.cdap.cdap.etl.validation.LoggingFailureCollector)2 StructuredRecord (io.cdap.cdap.api.data.format.StructuredRecord)1 Emitter (io.cdap.cdap.etl.api.Emitter)1 SplitterTransform (io.cdap.cdap.etl.api.SplitterTransform)1 StageMetrics (io.cdap.cdap.etl.api.StageMetrics)1 Transformation (io.cdap.cdap.etl.api.Transformation)1 BatchAggregator (io.cdap.cdap.etl.api.batch.BatchAggregator)1 BatchConfigurable (io.cdap.cdap.etl.api.batch.BatchConfigurable)1