
Example 1 with PipelinePluginContext

Use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.

From the class BatchSparkPipelineDriver, method run:

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
        // Close SQL Engine Adapter if needed.
        if (sqlEngineAdapter != null) {
            sqlEngineAdapter.onRunFinish(isSuccessful);
            sqlEngineAdapter.close();
        }
    }
}
Also used: Path(java.nio.file.Path) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) HashMap(java.util.HashMap) SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BufferedReader(java.io.BufferedReader) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
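The examples below all follow the same construction pattern: wrap the platform PluginContext in a PipelinePluginContext (Examples 3 and 5 use the SparkPipelinePluginContext subclass) and resolve macros at instantiation time with a DefaultMacroEvaluator. A minimal sketch of that pattern, assuming an execution context sec and a phase spec phaseSpec as in Example 1; the stage name "myStage" is a hypothetical placeholder and the snippet is not taken verbatim from the project.

// Minimal sketch of the common construction and instantiation pattern.
PipelinePluginContext pluginContext = new PipelinePluginContext(
    sec.getPluginContext(),              // underlying platform plugin context
    sec.getMetrics(),                    // metrics for the current run
    phaseSpec.isStageLoggingEnabled(),   // decorate plugin calls with per-stage logging
    phaseSpec.isProcessTimingEnabled()); // decorate plugin calls with timing metrics
MacroEvaluator evaluator = new DefaultMacroEvaluator(
    new BasicArguments(sec), sec.getLogicalStartTime(),
    sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
// Macros in the stage's plugin properties are evaluated when the instance is created.
Object plugin = pluginContext.newPluginInstance("myStage", evaluator);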

Example 2 with PipelinePluginContext

Use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.

From the class PipelineAction, method run:

@Override
public void run() throws Exception {
    CustomActionContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
    Action action = pluginContext.newPluginInstance(stageSpec.getName(), new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace()));
    ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
    if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
        action.run(actionContext);
    }
    WorkflowToken token = context.getWorkflowToken();
    if (token == null) {
        throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
        token.put(entry.getKey(), entry.getValue());
    }
}
Also used: Action(io.cdap.cdap.etl.api.action.Action) AbstractCustomAction(io.cdap.cdap.api.customaction.AbstractCustomAction) CustomAction(io.cdap.cdap.api.customaction.CustomAction) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) ActionContext(io.cdap.cdap.etl.api.action.ActionContext) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) HashMap(java.util.HashMap) Map(java.util.Map)

Example 3 with PipelinePluginContext

Use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.

From the class DataStreamsSparkLauncher, method initialize:

@TransactionPolicy(TransactionControl.EXPLICIT)
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", context.getApplicationSpecification().getName(), UserGroupInformation.getCurrentUser().getShortUserName(), arguments);
    DataStreamsPipelineSpec spec = GSON.fromJson(context.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), true, true);
    int numSources = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (StreamingSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            StreamingSource<Object> streamingSource = pluginContext.newPluginInstance(stageSpec.getName());
            numSources = numSources + streamingSource.getRequiredExecutors();
        }
    }
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.streaming.backpressure.enabled", "true");
    sparkConf.set("spark.spark.streaming.blockInterval", String.valueOf(spec.getBatchIntervalMillis() / 5));
    sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
    // spark... makes you set this to at least the number of receivers (streaming sources)
    // because it holds one thread per receiver, or one core in distributed mode.
    // so... we have to set this hacky master variable based on the isUnitTest setting in the config
    String extraOpts = spec.getExtraJavaOpts();
    if (extraOpts != null && !extraOpts.isEmpty()) {
        sparkConf.set("spark.driver.extraJavaOptions", extraOpts);
        sparkConf.set("spark.executor.extraJavaOptions", extraOpts);
    }
    // without this, stopping will hang on machines with few cores.
    sparkConf.set("spark.rpc.netty.dispatcher.numThreads", String.valueOf(numSources + 2));
    sparkConf.setMaster(String.format("local[%d]", numSources + 2));
    sparkConf.set("spark.executor.instances", String.valueOf(numSources + 2));
    if (spec.isUnitTest()) {
        sparkConf.setMaster(String.format("local[%d]", numSources + 1));
    }
    // override defaults with any user provided engine configs
    int minExecutors = numSources + 1;
    for (Map.Entry<String, String> property : spec.getProperties().entrySet()) {
        if ("spark.executor.instances".equals(property.getKey())) {
            // don't let the user set this to something that doesn't make sense
            try {
                int numExecutors = Integer.parseInt(property.getValue());
                if (numExecutors < minExecutors) {
                    LOG.warn("Number of executors {} is less than the minimum number required to run the pipeline. " + "Automatically increasing it to {}", numExecutors, minExecutors);
                    numExecutors = minExecutors;
                }
                sparkConf.set(property.getKey(), String.valueOf(numExecutors));
            } catch (NumberFormatException e) {
                throw new IllegalArgumentException("Number of spark executors was set to invalid value " + property.getValue(), e);
            }
        } else {
            sparkConf.set(property.getKey(), property.getValue());
        }
    }
    context.setSparkConf(sparkConf);
    WRAPPERLOGGER.info("Pipeline '{}' running", context.getApplicationSpecification().getName());
}
Also used: SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
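A detail worth noting in this example: user-supplied engine properties are applied verbatim except for spark.executor.instances, which is validated and raised to at least numSources + 1. A standalone sketch of that clamping logic follows; the helper name resolveExecutorCount is hypothetical and not part of the project.

// Hypothetical helper isolating the executor-count validation shown above.
static int resolveExecutorCount(String userValue, int minExecutors) {
    try {
        int numExecutors = Integer.parseInt(userValue);
        // the example logs a warning before raising the value to the minimum
        return Math.max(numExecutors, minExecutors);
    } catch (NumberFormatException e) {
        throw new IllegalArgumentException(
            "Number of spark executors was set to invalid value " + userValue, e);
    }
}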

Example 4 with PipelinePluginContext

Use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.

From the class SparkStreamingPipelineDriver, method run:

private JavaStreamingContext run(DataStreamsPipelineSpec pipelineSpec, PipelinePhase pipelinePhase, JavaSparkExecutionContext sec, @Nullable String checkpointDir, @Nullable JavaSparkContext context) throws Exception {
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
    SparkStreamingPreparer preparer = new SparkStreamingPreparer(pluginContext, sec.getMetrics(), evaluator, pipelineRuntime, sec);
    try {
        SparkFieldLineageRecorder recorder = new SparkFieldLineageRecorder(sec, pipelinePhase, pipelineSpec, preparer);
        recorder.record();
    } catch (Exception e) {
        LOG.warn("Failed to emit field lineage operations for streaming pipeline", e);
    }
    Set<String> uncombinableSinks = preparer.getUncombinableSinks();
    // the content in the function might not run due to spark checkpointing, currently just have the lineage logic
    // before anything is run
    Function0<JavaStreamingContext> contextFunction = (Function0<JavaStreamingContext>) () -> {
        JavaSparkContext javaSparkContext = context == null ? new JavaSparkContext() : context;
        JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.milliseconds(pipelineSpec.getBatchIntervalMillis()));
        SparkStreamingPipelineRunner runner = new SparkStreamingPipelineRunner(sec, jssc, pipelineSpec, pipelineSpec.isCheckpointsDisabled());
        // Seems like they should be set at configure time instead of runtime? but that requires an API change.
        try {
            PhaseSpec phaseSpec = new PhaseSpec(sec.getApplicationSpecification().getName(), pipelinePhase, Collections.emptyMap(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
            boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
            boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
            runner.runPipeline(phaseSpec, StreamingSource.PLUGIN_TYPE, sec, Collections.emptyMap(), pluginContext, Collections.emptyMap(), uncombinableSinks, shouldConsolidateStages, shouldCacheFunctions);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        if (checkpointDir != null) {
            jssc.checkpoint(checkpointDir);
            jssc.sparkContext().hadoopConfiguration().set("fs.defaultFS", checkpointDir);
        }
        return jssc;
    };
    return checkpointDir == null ? contextFunction.call() : JavaStreamingContext.getOrCreate(checkpointDir, contextFunction, context.hadoopConfiguration());
}
Also used: PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SparkStreamingPreparer(io.cdap.cdap.etl.spark.streaming.SparkStreamingPreparer) Function0(org.apache.spark.api.java.function.Function0) IOException(java.io.IOException) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PhaseSpec(io.cdap.cdap.etl.common.PhaseSpec) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)

Example 5 with PipelinePluginContext

Use of io.cdap.cdap.etl.common.plugin.PipelinePluginContext in project cdap by caskdata.

From the class JavaSparkMainWrapper, method run:

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
    BatchPhaseSpec batchPhaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), batchPhaseSpec.isStageLoggingEnabled(), batchPhaseSpec.isProcessTimingEnabled());
    Class<?> mainClass = pluginContext.loadPluginClass(stageName);
    // if it's a CDAP JavaSparkMain, instantiate it and call the run method
    if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
        JavaSparkMain javaSparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
        javaSparkMain.run(sec);
    } else {
        // otherwise, assume there is a 'main' method and call it
        String programArgs = getProgramArgs(sec, stageName);
        String[] args = programArgs == null ? RuntimeArguments.toPosixArray(sec.getRuntimeArguments()) : programArgs.split(" ");
        final Method mainMethod = mainClass.getMethod("main", String[].class);
        final Object[] methodArgs = new Object[1];
        methodArgs[0] = args;
        Caller caller = pluginContext.getCaller(stageName);
        caller.call(new Callable<Void>() {

            @Override
            public Void call() throws Exception {
                mainMethod.invoke(null, methodArgs);
                return null;
            }
        });
    }
}
Also used: DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) Method(java.lang.reflect.Method) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) Caller(io.cdap.cdap.etl.common.plugin.Caller) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) JavaSparkMain(io.cdap.cdap.api.spark.JavaSparkMain) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
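The Caller obtained from the plugin context presumably applies the same per-stage decorations configured on the context (logging, timing) to the reflective main invocation. On Java 8+ the anonymous Callable in the example could equally be written as a lambda; a sketch reusing the names from the example above:

// Same reflective invocation as above, expressed as a lambda (sketch only).
Caller caller = pluginContext.getCaller(stageName);
caller.call(() -> {
    // invoke the plugin's static main(String[] args)
    mainMethod.invoke(null, (Object) args);
    return null;
});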

Aggregations

PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 7
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 6
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 5
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 5
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 4
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 4
HashMap (java.util.HashMap): 4
PluginContext (io.cdap.cdap.api.plugin.PluginContext): 3
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 3
Map (java.util.Map): 3
WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken): 2
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 2
SparkPipelinePluginContext (io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext): 2
TransactionPolicy (io.cdap.cdap.api.annotation.TransactionPolicy): 1
AbstractCustomAction (io.cdap.cdap.api.customaction.AbstractCustomAction): 1
CustomAction (io.cdap.cdap.api.customaction.CustomAction): 1
CustomActionContext (io.cdap.cdap.api.customaction.CustomActionContext): 1
TriggeringScheduleInfo (io.cdap.cdap.api.schedule.TriggeringScheduleInfo): 1
JavaSparkMain (io.cdap.cdap.api.spark.JavaSparkMain): 1
SparkClientContext (io.cdap.cdap.api.spark.SparkClientContext): 1