Search in sources :

Example 1 with BatchReducibleAggregator

Use of io.cdap.cdap.etl.api.batch.BatchReducibleAggregator in project cdap by caskdata.

In the class PipelinePhasePreparer, method prepare.

/**
 * Prepares every stage of the given phase and collects the plugins whose run was prepared.
 *
 * <p>Stages are visited in the topological order of the phase's DAG. For each stage a submitter plugin is created
 * according to the stage's plugin type, its {@code prepareRun()} is invoked, and it is appended to the result.
 * Stages for which no submitter plugin is created are skipped.
 *
 * @param phaseSpec the pipeline phase to prepare
 * @return the finishers that should be run when the pipeline ends, in the order their stages were prepared
 */
public List<Finisher> prepare(PhaseSpec phaseSpec) throws TransactionFailureException, InstantiationException, IOException {
    PipelinePluginInstantiator pluginInstantiator = getPluginInstantiator(phaseSpec);
    PipelinePhase phase = phaseSpec.getPhase();
    List<Finisher> prepared = new ArrayList<>();
    // topological order guarantees that arguments set by a stage are visible to the stages that follow it
    for (String stageName : phase.getDag().getTopologicalOrder()) {
        SubmitterPlugin submitter = newSubmitterPluginFor(pluginInstantiator, phase, stageName);
        if (submitter == null) {
            continue;
        }
        submitter.prepareRun();
        prepared.add(submitter);
    }
    return prepared;
}

/**
 * Instantiates the plugin of a single stage and wraps it in the submitter plugin matching its plugin type.
 *
 * @param pluginInstantiator instantiator used to create the stage's plugin
 * @param phase the phase the stage belongs to
 * @param stageName name of the stage to create a submitter plugin for
 * @return the submitter plugin for the stage; may be {@code null}, in which case the caller skips the stage
 * @throws IllegalStateException if an aggregator or joiner stage is backed by a class of an unsupported kind
 */
private SubmitterPlugin newSubmitterPluginFor(PipelinePluginInstantiator pluginInstantiator, PipelinePhase phase, String stageName) throws TransactionFailureException, InstantiationException, IOException {
    StageSpec stageSpec = phase.getStage(stageName);
    String pluginType = stageSpec.getPluginType();
    // a connector stage acts as a source or as a sink depending on where it sits in this phase
    boolean isConnector = Constants.Connector.PLUGIN_TYPE.equals(pluginType);
    boolean connectorSource = isConnector && phase.getSources().contains(stageName);
    boolean connectorSink = isConnector && phase.getSinks().contains(stageName);

    if (BatchSource.PLUGIN_TYPE.equals(pluginType) || connectorSource) {
        BatchConfigurable<BatchSourceContext> source = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        return createSource(source, stageSpec);
    }
    if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || connectorSink) {
        BatchConfigurable<BatchSinkContext> sink = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        return createSink(sink, stageSpec);
    }
    if (Transform.PLUGIN_TYPE.equals(pluginType) || ErrorTransform.PLUGIN_TYPE.equals(pluginType)) {
        Transform<?, ?> transformPlugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        return createTransform(transformPlugin, stageSpec);
    }
    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        // the aggregator plugin type covers two classes; the plain aggregator is tested first
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        if (plugin instanceof BatchAggregator) {
            BatchAggregator<?, ?, ?> plainAggregator = (BatchAggregator) plugin;
            return createAggregator(plainAggregator, stageSpec);
        }
        if (plugin instanceof BatchReducibleAggregator) {
            BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator) plugin;
            return createReducibleAggregator(reducibleAggregator, stageSpec);
        }
        throw new IllegalStateException(String.format("Aggregator stage '%s' is of an unsupported class '%s'.", stageSpec.getName(), plugin.getClass().getName()));
    }
    if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        // the joiner plugin type covers two classes; the plain joiner is tested first
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        if (plugin instanceof BatchJoiner) {
            BatchJoiner<?, ?, ?> plainJoiner = (BatchJoiner<?, ?, ?>) plugin;
            return createJoiner(plainJoiner, stageSpec);
        }
        if (plugin instanceof BatchAutoJoiner) {
            BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
            // auto-joiners are validated before their submitter plugin is created
            validateAutoJoiner(autoJoiner, stageSpec);
            return createAutoJoiner(autoJoiner, stageSpec);
        }
        throw new IllegalStateException(String.format("Join stage '%s' is of an unsupported class '%s'.", stageSpec.getName(), plugin.getClass().getName()));
    }
    if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
        SplitterTransform<?, ?> splitter = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        return createSplitterTransform(splitter, stageSpec);
    }
    // every other plugin type is handled by the generic creation hook
    return create(pluginInstantiator, stageSpec);
}
Also used : BatchSourceContext(io.cdap.cdap.etl.api.batch.BatchSourceContext) ArrayList(java.util.ArrayList) SplitterTransform(io.cdap.cdap.etl.api.SplitterTransform) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) BatchAggregator(io.cdap.cdap.etl.api.batch.BatchAggregator) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) BatchConfigurable(io.cdap.cdap.etl.api.batch.BatchConfigurable) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)

Example 2 with BatchReducibleAggregator

Use of io.cdap.cdap.etl.api.batch.BatchReducibleAggregator in project cdap by caskdata.

In the class MapReduceTransformExecutorFactory, method getTransformation.

/**
 * Builds the tracked transformation for a stage, handling aggregator and joiner stages here and delegating every
 * other plugin type to the parent implementation.
 *
 * <p>Aggregator and joiner plugins are instantiated and initialized with a runtime context, then wrapped in the
 * mapper-side or the reducer-side transformation depending on {@code isMapPhase}.
 *
 * @param stageSpec specification of the stage to build the transformation for
 * @return the tracked transformation for the stage
 * @throws Exception if building the transformation fails, including when the failure collector of an auto-joiner
 *     stage reports failures
 */
@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    String pluginType = stageSpec.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
    StageStatisticsCollector collector;
    if (collectStageStatistics) {
        collector = new MapReduceStageStatisticsCollector(stageName, taskAttemptContext);
    } else {
        collector = new NoopStageStatisticsCollector();
    }

    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchAggregator<?, ?, ?> aggregator;
        if (!(plugin instanceof BatchReducibleAggregator)) {
            aggregator = (BatchAggregator<?, ?, ?>) plugin;
        } else {
            // a reducible aggregator is adapted to the plain aggregator interface through the bridge
            BatchReducibleAggregator<?, ?, ?, ?> reducible = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
            aggregator = new AggregatorBridge<>(reducible);
        }
        BatchRuntimeContext aggregatorContext = createRuntimeContext(stageSpec);
        aggregator.initialize(aggregatorContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        }
        return getTrackedAggregateStep(new ReducerAggregatorTransformation(aggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
    }

    if (!BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        // neither an aggregator nor a joiner: let the parent factory build the transformation
        return super.getTransformation(stageSpec);
    }

    Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    BatchJoiner<?, ?, ?> joiner;
    // stays empty unless an auto-joiner with a non-null-safe key equality condition is being set up
    Set<String> filterNullKeyStages = new HashSet<>();
    if (!(plugin instanceof BatchAutoJoiner)) {
        joiner = (BatchJoiner<?, ?, ?>) plugin;
    } else {
        BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
        FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
        DefaultAutoJoinerContext context = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
        // the definition is non-null here because PipelinePhasePreparer validates it at the start of the run
        JoinDefinition joinDefinition = autoJoiner.define(context);
        JoinCondition condition = joinDefinition.getCondition();
        boolean isKeyEquality = condition.getOp() == JoinCondition.Op.KEY_EQUALITY;
        // already checked at deployment time, so this should never trigger; kept as a safety net
        if (!isKeyEquality) {
            failureCollector.addFailure(String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.", stageName, condition.getOp()), "Switch to a different execution engine.");
        }
        failureCollector.getOrThrowException();
        joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
        // equivalent to filtering out records that have a null key when they come from an optional stage
        if (isKeyEquality && !((JoinCondition.OnKeys) condition).isNullSafe()) {
            for (JoinStage joinStage : joinDefinition.getStages()) {
                if (!joinStage.isRequired()) {
                    filterNullKeyStages.add(joinStage.getStageName());
                }
            }
        }
    }
    BatchJoinerRuntimeContext joinerContext = createRuntimeContext(stageSpec);
    joiner.initialize(joinerContext);
    if (isMapPhase) {
        return getTrackedEmitKeyStep(new MapperJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName, filterNullKeyStages), stageMetrics, getDataTracer(stageName), collector);
    }
    return getTrackedMergeStep(new ReducerJoinerTransformation(joiner, mapOutputKeyClassName, mapOutputValClassName, joinerContext.getInputSchemas().size()), stageMetrics, getDataTracer(stageName), collector);
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) Set(java.util.Set) HashSet(java.util.HashSet) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchRuntimeContext(io.cdap.cdap.etl.api.batch.BatchRuntimeContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 3 with BatchReducibleAggregator

Use of io.cdap.cdap.etl.api.batch.BatchReducibleAggregator in project cdap by caskdata.

In the class PipelinePluginContext, method wrapPlugin.

/**
 * Wraps a plugin instance in the wrapper class that corresponds to the first plugin type it matches.
 *
 * <p>The checks run in a fixed order and the first match wins, so the order matters for any plugin that is an
 * instance of more than one of the tested types. Plugins matching none of the checks are handed to
 * {@code wrapUnknownPlugin}.
 *
 * @param pluginId id of the plugin; used to look up its caller and to scope its stage metrics
 * @param plugin the plugin instance to wrap
 * @return the wrapped plugin
 */
private Object wrapPlugin(String pluginId, Object plugin) {
    Caller pluginCaller = getCaller(pluginId);
    StageMetrics pluginMetrics = new DefaultStageMetrics(metrics, pluginId);
    // only time operations when process timing is enabled; otherwise use the no-op timer
    OperationTimer timer;
    if (processTimingEnabled) {
        timer = new MetricsOperationTimer(pluginMetrics);
    } else {
        timer = NoOpOperationTimer.INSTANCE;
    }

    if (plugin instanceof Action) {
        return new WrappedAction((Action) plugin, pluginCaller);
    }
    if (plugin instanceof BatchSource) {
        return new WrappedBatchSource<>((BatchSource) plugin, pluginCaller, timer);
    }
    if (plugin instanceof BatchSink) {
        return new WrappedBatchSink<>((BatchSink) plugin, pluginCaller, timer);
    }
    // ErrorTransform is tested ahead of Transform
    if (plugin instanceof ErrorTransform) {
        return new WrappedErrorTransform<>((ErrorTransform) plugin, pluginCaller, timer);
    }
    if (plugin instanceof Transform) {
        return new WrappedTransform<>((Transform) plugin, pluginCaller, timer);
    }
    // BatchReducibleAggregator is tested ahead of BatchAggregator
    if (plugin instanceof BatchReducibleAggregator) {
        return new WrappedReduceAggregator<>((BatchReducibleAggregator) plugin, pluginCaller, timer);
    }
    if (plugin instanceof BatchAggregator) {
        return new WrappedBatchAggregator<>((BatchAggregator) plugin, pluginCaller, timer);
    }
    if (plugin instanceof BatchJoiner) {
        return new WrappedBatchJoiner<>((BatchJoiner) plugin, pluginCaller, timer);
    }
    if (plugin instanceof PostAction) {
        return new WrappedPostAction((PostAction) plugin, pluginCaller);
    }
    if (plugin instanceof SplitterTransform) {
        return new WrappedSplitterTransform<>((SplitterTransform) plugin, pluginCaller, timer);
    }
    return wrapUnknownPlugin(pluginId, plugin, pluginCaller);
}
Also used : Action(io.cdap.cdap.etl.api.action.Action) PostAction(io.cdap.cdap.etl.api.batch.PostAction) BatchSource(io.cdap.cdap.etl.api.batch.BatchSource) BatchAggregator(io.cdap.cdap.etl.api.batch.BatchAggregator) BatchSink(io.cdap.cdap.etl.api.batch.BatchSink) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator) SplitterTransform(io.cdap.cdap.etl.api.SplitterTransform) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) ErrorTransform(io.cdap.cdap.etl.api.ErrorTransform) PostAction(io.cdap.cdap.etl.api.batch.PostAction) ErrorTransform(io.cdap.cdap.etl.api.ErrorTransform) SplitterTransform(io.cdap.cdap.etl.api.SplitterTransform) Transform(io.cdap.cdap.etl.api.Transform) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics)

Aggregations

BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner)3 BatchReducibleAggregator (io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)3 SplitterTransform (io.cdap.cdap.etl.api.SplitterTransform)2 StageMetrics (io.cdap.cdap.etl.api.StageMetrics)2 BatchAggregator (io.cdap.cdap.etl.api.batch.BatchAggregator)2 BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner)2 DefaultStageMetrics (io.cdap.cdap.etl.common.DefaultStageMetrics)2 ErrorTransform (io.cdap.cdap.etl.api.ErrorTransform)1 FailureCollector (io.cdap.cdap.etl.api.FailureCollector)1 Transform (io.cdap.cdap.etl.api.Transform)1 Action (io.cdap.cdap.etl.api.action.Action)1 BatchConfigurable (io.cdap.cdap.etl.api.batch.BatchConfigurable)1 BatchJoinerRuntimeContext (io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext)1 BatchRuntimeContext (io.cdap.cdap.etl.api.batch.BatchRuntimeContext)1 BatchSink (io.cdap.cdap.etl.api.batch.BatchSink)1 BatchSource (io.cdap.cdap.etl.api.batch.BatchSource)1 BatchSourceContext (io.cdap.cdap.etl.api.batch.BatchSourceContext)1 PostAction (io.cdap.cdap.etl.api.batch.PostAction)1 JoinCondition (io.cdap.cdap.etl.api.join.JoinCondition)1 JoinDefinition (io.cdap.cdap.etl.api.join.JoinDefinition)1