Search in sources :

Example 1 with BatchSinkContext

Use of co.cask.cdap.etl.api.batch.BatchSinkContext in the cdap project by caskdata.

From the class ETLSpark, method initialize.

/**
 * Prepares this Spark program for running one pipeline phase.
 *
 * <p>The method performs the following steps, in order:
 * <ol>
 *   <li>Builds a {@link SparkConf} with fixed JVM/speculation settings, overlays the pipeline
 *       properties from the phase spec, and hands it to the Spark client context.</li>
 *   <li>Walks the stages of the phase in topological order; for every stage whose plugin type is
 *       recognized, wraps the plugin in a {@link SubmitterPlugin}, calls {@code prepareRun()} on it
 *       and remembers it as a {@link Finisher}. Stages of any other plugin type are skipped.</li>
 *   <li>Serializes the source factory, sink factory and joiner partition counts as JSON into a
 *       temporary file and localizes it under the name {@code HydratorSpark.config}.</li>
 *   <li>Copies the arguments added to the pipeline runtime during preparation into the workflow
 *       token, when a token is available.</li>
 * </ol>
 *
 * @throws IllegalStateException if the program specification carries no phase spec under
 *                               {@code Constants.PIPELINEID}
 * @throws Exception if plugin instantiation, a plugin's {@code prepareRun()}, or writing the
 *                   configuration file fails
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    List<Finisher> finishers = new ArrayList<>();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    if (phaseSpec == null) {
        // Gson yields null for a null/empty input; fail with a descriptive message instead of a bare NPE below.
        throw new IllegalStateException("Missing pipeline phase specification in program property '" + Constants.PIPELINEID + "'");
    }
    // Pipeline-level properties are applied last so they can override the defaults set above.
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // Hand over the configuration only once it is complete, so the result does not depend on
    // whether the context keeps a reference to this object or takes a copy of it.
    context.setSparkConf(sparkConf);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(context), context.getLogicalStartTime(), context, context.getNamespace());
    final SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    // stage name -> number of partitions reported by that stage's joiner context
    final Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, context.getMetrics(), phaseSpec, new SingleConnectorFactory());
    final PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    final Admin admin = context.getAdmin();
    PipelinePhase phase = phaseSpec.getPhase();
    // go through in topological order so that arguments set by one stage are seen by stages after it
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        // A connector stage acts as a source or a sink depending on where it sits in this phase.
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        // Stays null for plugin types that none of the branches below handle.
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSourceContext> contextProvider = new ContextProvider<BatchSourceContext>() {

                @Override
                public BatchSourceContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSource, contextProvider);
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            // NOTE(review): transforms receive a SparkBatchSourceContext as their StageSubmitterContext;
            // presumably intentional reuse of that context implementation - confirm.
            ContextProvider<StageSubmitterContext> contextProvider = new ContextProvider<StageSubmitterContext>() {

                @Override
                public StageSubmitterContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSourceContext(sourceFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, transform, contextProvider);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<BatchSinkContext> contextProvider = new ContextProvider<BatchSinkContext>() {

                @Override
                public BatchSinkContext getContext(DatasetContext datasetContext) {
                    return new SparkBatchSinkContext(sinkFactory, context, pipelineRuntime, datasetContext, stageSpec);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, batchSink, contextProvider);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<SparkPluginContext> contextProvider = new ContextProvider<SparkPluginContext>() {

                @Override
                public SparkPluginContext getContext(DatasetContext datasetContext) {
                    return new BasicSparkPluginContext(context, pipelineRuntime, stageSpec, datasetContext, admin);
                }
            };
            submitterPlugin = new SubmitterPlugin(stageName, context, sparkSink, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, admin);
            submitterPlugin = new SubmitterPlugin(stageName, context, aggregator, contextProvider);
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, admin);
            // The extra prepare action records the partition count exposed by the joiner's context
            // so it can be written to the localized configuration file below.
            submitterPlugin = new SubmitterPlugin<>(stageName, context, joiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext sparkJoinerContext) {
                    stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
                }
            });
        }
        if (submitterPlugin != null) {
            // Only plugins whose prepareRun() returned normally are registered as finishers.
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    // Register the file for cleanup before writing, so it is tracked even if the write fails.
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = new CompositeFinisher(finishers);
    context.localize("HydratorSpark.config", configFile.toURI());
    // Publish arguments added during stage preparation to the workflow token, when one exists.
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) SingleConnectorFactory(co.cask.cdap.etl.batch.connector.SingleConnectorFactory) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) StageSubmitterContext(co.cask.cdap.etl.api.StageSubmitterContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(co.cask.cdap.etl.common.BasicArguments) DatasetContext(co.cask.cdap.api.data.DatasetContext) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) Map(java.util.Map) HashMap(java.util.HashMap) File(java.io.File) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) 
PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) Admin(co.cask.cdap.api.Admin) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) Transform(co.cask.cdap.etl.api.Transform) SparkConf(org.apache.spark.SparkConf) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) Writer(java.io.Writer) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)

Aggregations

Admin (co.cask.cdap.api.Admin)1 TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy)1 DatasetContext (co.cask.cdap.api.data.DatasetContext)1 MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)1 PluginContext (co.cask.cdap.api.plugin.PluginContext)1 SparkClientContext (co.cask.cdap.api.spark.SparkClientContext)1 WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken)1 StageSubmitterContext (co.cask.cdap.etl.api.StageSubmitterContext)1 Transform (co.cask.cdap.etl.api.Transform)1 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)1 BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable)1 BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner)1 BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext)1 BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext)1 SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext)1 BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)1 DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext)1 DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext)1 PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator)1 SingleConnectorFactory (co.cask.cdap.etl.batch.connector.SingleConnectorFactory)1