
Example 1 with PipelinePluginInstantiator

Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.

From the class PipelinePhasePreparer, method prepare:

/**
 * Prepare all the stages in the given phase and return Finishers that must be run when the pipeline completes.
 *
 * @param phaseSpec the pipeline phase to prepare
 * @return list of finishers that should be run when the pipeline ends
 */
public List<Finisher> prepare(PhaseSpec phaseSpec) throws TransactionFailureException, InstantiationException, IOException {
    PipelinePluginInstantiator pluginInstantiator = getPluginInstantiator(phaseSpec);
    PipelinePhase phase = phaseSpec.getPhase();
    List<Finisher> finishers = new ArrayList<>();
    // call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
    for (String stageName : phase.getDag().getTopologicalOrder()) {
        StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            submitterPlugin = createSource(batchSource, stageSpec);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            submitterPlugin = createSink(batchSink, stageSpec);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType) || ErrorTransform.PLUGIN_TYPE.equals(pluginType)) {
            Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            submitterPlugin = createTransform(transform, stageSpec);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            if (plugin instanceof BatchAggregator) {
                BatchAggregator<?, ?, ?> aggregator = (BatchAggregator<?, ?, ?>) plugin;
                submitterPlugin = createAggregator(aggregator, stageSpec);
            } else if (plugin instanceof BatchReducibleAggregator) {
                BatchReducibleAggregator<?, ?, ?, ?> aggregator = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
                submitterPlugin = createReducibleAggregator(aggregator, stageSpec);
            } else {
                throw new IllegalStateException(String.format("Aggregator stage '%s' is of an unsupported class '%s'.", stageSpec.getName(), plugin.getClass().getName()));
            }
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            if (plugin instanceof BatchJoiner) {
                BatchJoiner<?, ?, ?> batchJoiner = (BatchJoiner<?, ?, ?>) plugin;
                submitterPlugin = createJoiner(batchJoiner, stageSpec);
            } else if (plugin instanceof BatchAutoJoiner) {
                BatchAutoJoiner batchJoiner = (BatchAutoJoiner) plugin;
                validateAutoJoiner(batchJoiner, stageSpec);
                submitterPlugin = createAutoJoiner(batchJoiner, stageSpec);
            } else {
                throw new IllegalStateException(String.format("Join stage '%s' is of an unsupported class '%s'.", stageSpec.getName(), plugin.getClass().getName()));
            }
        } else if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
            SplitterTransform<?, ?> splitterTransform = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
            submitterPlugin = createSplitterTransform(splitterTransform, stageSpec);
        } else {
            submitterPlugin = create(pluginInstantiator, stageSpec);
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    return finishers;
}
Also used: BatchSourceContext (io.cdap.cdap.etl.api.batch.BatchSourceContext), ArrayList (java.util.ArrayList), SplitterTransform (io.cdap.cdap.etl.api.SplitterTransform), BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner), BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner), PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase), BatchAggregator (io.cdap.cdap.etl.api.batch.BatchAggregator), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec), PipelinePluginInstantiator (io.cdap.cdap.etl.batch.PipelinePluginInstantiator), BatchConfigurable (io.cdap.cdap.etl.api.batch.BatchConfigurable), BatchReducibleAggregator (io.cdap.cdap.etl.api.batch.BatchReducibleAggregator)
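
The value of prepare() is the Finisher list it returns: each SubmitterPlugin is both prepared up front via prepareRun() and retained so its completion hook can run when the pipeline ends. Below is a minimal, self-contained sketch of that prepare-then-finish pattern; the Finisher and SubmitterPlugin shapes here are simplified stand-ins for illustration, not the exact CDAP interfaces.

import java.util.ArrayList;
import java.util.List;

public class PrepareFinishSketch {

    // Simplified stand-in for a completion hook run when the pipeline ends.
    interface Finisher {
        void onFinish(boolean succeeded);
    }

    // Simplified stand-in for SubmitterPlugin: prepares a stage, then finishes it later.
    static class SubmitterPlugin implements Finisher {
        private final String stageName;

        SubmitterPlugin(String stageName) {
            this.stageName = stageName;
        }

        void prepareRun() {
            // In CDAP this would invoke the plugin's own prepareRun logic.
            System.out.println("prepared " + stageName);
        }

        @Override
        public void onFinish(boolean succeeded) {
            System.out.println("finished " + stageName + ", succeeded=" + succeeded);
        }
    }

    public static void main(String[] args) {
        // Prepare stages in topological order, collecting finishers as prepare() does.
        List<Finisher> finishers = new ArrayList<>();
        for (String stage : new String[] { "source", "transform", "sink" }) {
            SubmitterPlugin plugin = new SubmitterPlugin(stage);
            plugin.prepareRun();
            finishers.add(plugin);
        }

        // When the run completes, fire every finisher with the final status.
        boolean succeeded = true;
        for (Finisher finisher : finishers) {
            finisher.onFinish(succeeded);
        }
    }
}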

Example 2 with PipelinePluginInstantiator

Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.

From the class BatchSparkPipelineDriver, method run:

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
        // Close SQL engine adapter if needed.
        if (sqlEngineAdapter != null) {
            sqlEngineAdapter.onRunFinish(isSuccessful);
            sqlEngineAdapter.close();
        }
    }
}
Also used: Path (java.nio.file.Path), DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator), HashMap (java.util.HashMap), SingleConnectorFactory (io.cdap.cdap.etl.batch.connector.SingleConnectorFactory), SparkStageStatisticsCollector (io.cdap.cdap.etl.spark.SparkStageStatisticsCollector), StageStatisticsCollector (io.cdap.cdap.etl.common.StageStatisticsCollector), SQLEngine (io.cdap.cdap.etl.api.engine.sql.SQLEngine), StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec), BufferedReader (java.io.BufferedReader), BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec), BasicArguments (io.cdap.cdap.etl.common.BasicArguments), PipelinePluginInstantiator (io.cdap.cdap.etl.batch.PipelinePluginInstantiator), PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
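
Two patterns in run() generalize well: deserializing a one-line JSON config from a localized file with Gson, and reading boolean runtime arguments with Boolean.TRUE.toString() as the default so the features stay on unless explicitly disabled. A minimal sketch under assumptions: Gson is on the classpath, FactoryInfo is a hypothetical stand-in for SparkBatchSourceSinkFactoryInfo, and the flag keys stand in for Constants.CONSOLIDATE_STAGES and Constants.CACHE_FUNCTIONS.

import com.google.gson.Gson;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Map;

public class ConfigReadSketch {

    // Hypothetical stand-in for SparkBatchSourceSinkFactoryInfo.
    static class FactoryInfo {
        int numPartitions;
    }

    private static final Gson GSON = new Gson();

    public static void main(String[] args) throws IOException {
        // Write a one-line JSON config, mimicking the localized "HydratorSpark.config" file.
        Path configFile = Files.createTempFile("sketch", ".config");
        Files.write(configFile, "{\"numPartitions\": 4}".getBytes(StandardCharsets.UTF_8));

        // Read the single JSON line and deserialize it, as run() does with Gson.
        FactoryInfo info;
        try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
            info = GSON.fromJson(reader.readLine(), FactoryInfo.class);
        }
        System.out.println("partitions = " + info.numPartitions);

        // Boolean runtime flags default to true unless explicitly set to "false".
        // The keys below are illustrative placeholders, not the real CDAP constants.
        Map<String, String> runtimeArgs = Map.of("consolidate.stages", "false");
        boolean consolidate =
            Boolean.parseBoolean(runtimeArgs.getOrDefault("consolidate.stages", Boolean.TRUE.toString()));
        boolean cacheFunctions =
            Boolean.parseBoolean(runtimeArgs.getOrDefault("cache.functions", Boolean.TRUE.toString()));
        System.out.println("consolidate=" + consolidate + ", cacheFunctions=" + cacheFunctions);
    }
}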

Example 3 with PipelinePluginInstantiator

Use of io.cdap.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.

From the class MultiSinkFunction, method initializeBranchExecutors:

private void initializeBranchExecutors() {
    emitter = new DefaultEmitter<>();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pipelineRuntime.getPluginContext(), pipelineRuntime.getMetrics(), phaseSpec, new SingleConnectorFactory());
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), pipelineRuntime.getLogicalStartTime(), pipelineRuntime.getSecureStore(), pipelineRuntime.getServiceDiscoverer(), pipelineRuntime.getNamespace());
    executorFactory = new SparkTransformExecutorFactory(pluginInstantiator, macroEvaluator, null, collectors, dataTracers, pipelineRuntime, emitter);
    /*
       If the dag is:

            |--> t1 --> k1
       s1 --|
            |--> k2
                 ^
           s2 ---|

       the group is t1, k1, and k2.
     */
    PipelinePhase pipelinePhase = phaseSpec.getPhase();
    branchExecutors = new HashMap<>();
    inputConnections = new HashMap<>();
    for (String groupSource : group) {
        // group "sources" are stages in the group that don't have an input from another stage in the group.
        if (Sets.difference(pipelinePhase.getStageInputs(groupSource), group).isEmpty()) {
            continue;
        }
        // get the branch by taking a subset of the pipeline starting from the "source".
        // with the example above, the two branches are t1 -> k1, and k2.
        PipelinePhase branch;
        if (pipelinePhase.getSinks().contains(groupSource)) {
            // pipelinePhase.subsetFrom() throws an exception if the new "source" is also a sink,
            // since a Dag cannot be a single node. so build it manually.
            branch = PipelinePhase.builder(pipelinePhase.getPluginTypes()).addStage(pipelinePhase.getStage(groupSource)).build();
        } else {
            branch = pipelinePhase.subsetFrom(Collections.singleton(groupSource));
        }
        try {
            branchExecutors.put(groupSource, executorFactory.create(branch));
        } catch (Exception e) {
            throw new IllegalStateException(String.format("Unable to get subset of pipeline starting from stage %s. " + "This indicates a planning error. Please report this bug and turn off stage " + "consolidation by setting %s to false in the runtime arguments.", groupSource, Constants.CONSOLIDATE_STAGES), e);
        }
        /*
          create a mapping from possible inputs to "group sources". This will help identify which incoming
          records should be sent to which branch executor.

          for example, the pipeline may look like:

                           |port a --> k1
             s --> split --|
                           |port b --> k2

          In this scenario, k1, and k2, are all in the same group, so the map contains:

            { stageName: split, port: a, type: output } -> [k1]
            { stageName: split, port: b, type: output } -> [k2]

          A slightly more complicated example:

                               |--> k1
            s1 --> transform --|
                      |        |--> k2
                      |
                      |--> error collector --> k3

          In this scenario, k1, k2, k3, and error collector are in the same group, so the map contains:

            { stageName: transform, type: output } -> [k1, k2]
            { stageName: transform, type: error } -> [k3]
       */
        String groupSourceType = pipelinePhase.getStage(groupSource).getPluginType();
        RecordType recordType = ErrorTransform.PLUGIN_TYPE.equals(groupSourceType) ? RecordType.ERROR : RecordType.OUTPUT;
        for (String inputStage : pipelinePhase.getStageInputs(groupSource)) {
            Map<String, StageSpec.Port> ports = pipelinePhase.getStage(inputStage).getOutputPorts();
            String port = ports.get(groupSource).getPort();
            InputInfo inputInfo = new InputInfo(inputStage, recordType, port);
            Set<String> groupSources = inputConnections.computeIfAbsent(inputInfo, key -> new HashSet<>());
            groupSources.add(groupSource);
        }
    }
}
Also used: DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator), SingleConnectorFactory (io.cdap.cdap.etl.batch.connector.SingleConnectorFactory), SparkTransformExecutorFactory (io.cdap.cdap.etl.spark.SparkTransformExecutorFactory), RecordType (io.cdap.cdap.etl.common.RecordType), PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase), PipelinePluginInstantiator (io.cdap.cdap.etl.batch.PipelinePluginInstantiator)
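
The heart of initializeBranchExecutors is set arithmetic: a stage in the group is a branch "source" exactly when Sets.difference(stageInputs, group) is non-empty, meaning at least one of its inputs arrives from outside the group. Below is a toy sketch of that check on the dag from the first comment (s1 and s2 outside the group; t1, k1, k2 inside), assuming Guava is on the classpath; the stage wiring is hard-coded for illustration.

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import java.util.Map;
import java.util.Set;

public class GroupSourceSketch {

    public static void main(String[] args) {
        // Hard-coded wiring for the dag in the comment above:
        //   s1 --> t1 --> k1        s1 --> k2 <-- s2
        Map<String, Set<String>> stageInputs = ImmutableMap.of(
            "t1", ImmutableSet.of("s1"),
            "k1", ImmutableSet.of("t1"),
            "k2", ImmutableSet.of("s1", "s2"));

        Set<String> group = ImmutableSet.of("t1", "k1", "k2");

        for (String stage : group) {
            // A stage is a group "source" if at least one input comes from outside the group.
            boolean isGroupSource = !Sets.difference(stageInputs.get(stage), group).isEmpty();
            System.out.println(stage + " isGroupSource=" + isGroupSource);
        }
        // Prints: t1 true, k1 false, k2 true -- matching the branches t1 -> k1 and k2.
    }
}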

Aggregations

PipelinePluginInstantiator (io.cdap.cdap.etl.batch.PipelinePluginInstantiator): 3 uses
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 2 uses
SingleConnectorFactory (io.cdap.cdap.etl.batch.connector.SingleConnectorFactory): 2 uses
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 2 uses
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 2 uses
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 2 uses
SplitterTransform (io.cdap.cdap.etl.api.SplitterTransform): 1 use
BatchAggregator (io.cdap.cdap.etl.api.batch.BatchAggregator): 1 use
BatchAutoJoiner (io.cdap.cdap.etl.api.batch.BatchAutoJoiner): 1 use
BatchConfigurable (io.cdap.cdap.etl.api.batch.BatchConfigurable): 1 use
BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner): 1 use
BatchReducibleAggregator (io.cdap.cdap.etl.api.batch.BatchReducibleAggregator): 1 use
BatchSourceContext (io.cdap.cdap.etl.api.batch.BatchSourceContext): 1 use
SQLEngine (io.cdap.cdap.etl.api.engine.sql.SQLEngine): 1 use
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 1 use
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 1 use
RecordType (io.cdap.cdap.etl.common.RecordType): 1 use
StageStatisticsCollector (io.cdap.cdap.etl.common.StageStatisticsCollector): 1 use
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 1 use
SparkStageStatisticsCollector (io.cdap.cdap.etl.spark.SparkStageStatisticsCollector): 1 use