Search in sources :

Example 1 with BatchPhaseSpec

use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class JavaSparkMainWrapper method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
    BatchPhaseSpec batchPhaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), batchPhaseSpec.isStageLoggingEnabled(), batchPhaseSpec.isProcessTimingEnabled());
    Class<?> mainClass = pluginContext.loadPluginClass(stageName);
    // if it's a CDAP JavaSparkMain, instantiate it and call the run method
    if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getNamespace());
        JavaSparkMain javaSparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
        javaSparkMain.run(sec);
    } else {
        // otherwise, assume there is a 'main' method and call it
        String programArgs = getProgramArgs(sec, stageName);
        String[] args = programArgs == null ? RuntimeArguments.toPosixArray(sec.getRuntimeArguments()) : programArgs.split(" ");
        final Method mainMethod = mainClass.getMethod("main", String[].class);
        final Object[] methodArgs = new Object[1];
        methodArgs[0] = args;
        Caller caller = pluginContext.getCaller(stageName);
        caller.call(new Callable<Void>() {

            @Override
            public Void call() throws Exception {
                mainMethod.invoke(null, methodArgs);
                return null;
            }
        });
    }
}
Also used : MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) Method(java.lang.reflect.Method) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) Caller(co.cask.cdap.etl.common.plugin.Caller) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) JavaSparkMain(co.cask.cdap.api.spark.JavaSparkMain) BasicArguments(co.cask.cdap.etl.common.BasicArguments) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext)

Example 2 with BatchPhaseSpec

use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class ETLMapReduce method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    List<Finisher> finishers = new ArrayList<>();
    final Job job = context.getHadoopJob();
    final Configuration hConf = job.getConfiguration();
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    // plugin name -> runtime args for that plugin
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    final PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
    final Map<String, String> inputAliasToStage = new HashMap<>();
    // call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSource, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {

                @Override
                public void act(MapReduceBatchContext sourceContext) {
                    for (String inputAlias : sourceContext.getInputNames()) {
                        inputAliasToStage.put(inputAlias, stageName);
                    }
                }
            });
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSink, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {

                @Override
                public void act(MapReduceBatchContext sinkContext) {
                    sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
                }
            });
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, transform, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
            submitterPlugin = new SubmitterPlugin<>(stageName, context, aggregator, contextProvider, new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {

                @Override
                public void act(DefaultAggregatorContext aggregatorContext) {
                    if (aggregatorContext.getNumPartitions() != null) {
                        job.setNumReduceTasks(aggregatorContext.getNumPartitions());
                    }
                    Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
                    Class<?> outputValClass = aggregatorContext.getGroupValueClass();
                    if (outputKeyClass == null) {
                        outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
                    }
                    if (outputValClass == null) {
                        outputValClass = TypeChecker.getGroupValueClass(aggregator);
                    }
                    hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                    hConf.set(MAP_VAL_CLASS, outputValClass.getName());
                    job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
                    job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
                }
            });
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchJoiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext joinerContext) {
                    if (joinerContext.getNumPartitions() != null) {
                        job.setNumReduceTasks(joinerContext.getNumPartitions());
                    }
                    Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
                    Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
                    if (outputKeyClass == null) {
                        outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
                    }
                    if (inputRecordClass == null) {
                        inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
                    }
                    hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                    hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
                    job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
                    getOutputValClass(stageName, inputRecordClass);
                    // for joiner plugin map output is tagged with stageName
                    job.setMapOutputValueClass(TaggedWritable.class);
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
    hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
    finisher = new CompositeFinisher(finishers);
    job.setMapperClass(ETLMapper.class);
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
    // token is null when just the mapreduce job is run but not the entire workflow
    // we still want things to work in that case.
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) Job(org.apache.hadoop.mapreduce.Job) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) MapReduceContext(co.cask.cdap.api.mapreduce.MapReduceContext) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) Map(java.util.Map) HashMap(java.util.HashMap) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) MultiConnectorFactory(co.cask.cdap.etl.batch.connector.MultiConnectorFactory) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Example 3 with BatchPhaseSpec

use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class PipelineCondition method apply.

@Override
public boolean apply(@Nullable WorkflowContext input) {
    if (input == null) {
        // should not happen
        throw new IllegalStateException("WorkflowContext for the Condition cannot be null.");
    }
    Map<String, String> properties = input.getConditionSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(input, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(input.getToken(), input.getRuntimeArguments()), input.getLogicalStartTime(), input, input.getNamespace());
    try {
        Condition condition = pluginContext.newPluginInstance(stageSpec.getName(), macroEvaluator);
        PipelineRuntime pipelineRuntime = new PipelineRuntime(input, metrics);
        ConditionContext conditionContext = new BasicConditionContext(input, pipelineRuntime, stageSpec);
        boolean result = condition.apply(conditionContext);
        WorkflowToken token = input.getToken();
        if (token == null) {
            throw new IllegalStateException("WorkflowToken cannot be null when Condition is executed through Workflow.");
        }
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
        return result;
    } catch (Exception e) {
        String msg = String.format("Error executing condition '%s' in the pipeline.", stageSpec.getName());
        throw new RuntimeException(msg, e);
    }
}
Also used : Condition(co.cask.cdap.etl.api.condition.Condition) AbstractCondition(co.cask.cdap.api.workflow.AbstractCondition) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) PluginContext(co.cask.cdap.api.plugin.PluginContext) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) ConditionContext(co.cask.cdap.etl.api.condition.ConditionContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) BasicArguments(co.cask.cdap.etl.common.BasicArguments) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext)

Example 4 with BatchPhaseSpec

use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class PipelineAction method run.

@Override
public void run() throws Exception {
    CustomActionContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
    Action action = pluginContext.newPluginInstance(stageSpec.getName(), new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace()));
    ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
    if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
        action.run(actionContext);
    }
    WorkflowToken token = context.getWorkflowToken();
    if (token == null) {
        throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
        token.put(entry.getKey(), entry.getValue());
    }
}
Also used : Action(co.cask.cdap.etl.api.action.Action) CustomAction(co.cask.cdap.api.customaction.CustomAction) AbstractCustomAction(co.cask.cdap.api.customaction.AbstractCustomAction) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) CustomActionContext(co.cask.cdap.api.customaction.CustomActionContext) ActionContext(co.cask.cdap.etl.api.action.ActionContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) CustomActionContext(co.cask.cdap.api.customaction.CustomActionContext) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext)

Example 5 with BatchPhaseSpec

use of co.cask.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class ETLSpark method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    List<Finisher> finishers = new ArrayList<>();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
    final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    final Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
    final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    final Admin admin = context.getAdmin();
    PipelinePhase phase = phaseSpec.getPhase();
    // go through in topological order so that arguments set by one stage are seen by stages after it
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {

                @Override
                public BatchSourceContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {

                @Override
                public StageSubmitterContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {

                @Override
                public BatchSinkContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {

                @Override
                public SparkPluginContext getContext(DatasetContext datasetContext) {
                    return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext sparkJoinerContext) {
                    stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = new CompositeFinisher(finishers);
    context.localize("HydratorSpark.config", configFile.toURI());
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) StageSubmitterContext(co.cask.cdap.etl.api.StageSubmitterContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(co.cask.cdap.etl.common.BasicArguments) DatasetContext(co.cask.cdap.api.data.DatasetContext) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) Map(java.util.Map) HashMap(java.util.HashMap) File(java.io.File) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) Admin(co.cask.cdap.api.Admin) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) Transform(co.cask.cdap.etl.api.Transform) SparkConf(org.apache.spark.SparkConf) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) Writer(java.io.Writer) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Aggregations

BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)8 StageSpec (co.cask.cdap.etl.spec.StageSpec)7 HashMap (java.util.HashMap)6 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)5 PipelinePhase (co.cask.cdap.etl.common.PipelinePhase)5 MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)4 WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken)4 PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime)4 PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext)4 PluginContext (co.cask.cdap.api.plugin.PluginContext)3 PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator)3 BasicArguments (co.cask.cdap.etl.common.BasicArguments)3 Map (java.util.Map)3 TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy)2 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)2 BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable)2 BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext)2 DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext)2 DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext)2 SingleConnectorFactory (co.cask.cdap.etl.batch.connector.SingleConnectorFactory)2