Example 16 with DefaultMacroEvaluator

Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

The class MultiSinkFunction, method initializeBranchExecutors.

private void initializeBranchExecutors() {
    emitter = new DefaultEmitter<>();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pipelineRuntime.getPluginContext(), pipelineRuntime.getMetrics(), phaseSpec, new SingleConnectorFactory());
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), pipelineRuntime.getLogicalStartTime(), pipelineRuntime.getSecureStore(), pipelineRuntime.getServiceDiscoverer(), pipelineRuntime.getNamespace());
    executorFactory = new SparkTransformExecutorFactory(pluginInstantiator, macroEvaluator, null, collectors, dataTracers, pipelineRuntime, emitter);
    /*
       If the dag is:

            |--> t1 --> k1
       s1 --|
            |--> k2
                 ^
           s2 ---|

       the group is t1, k1, and k2.
     */
    PipelinePhase pipelinePhase = phaseSpec.getPhase();
    branchExecutors = new HashMap<>();
    inputConnections = new HashMap<>();
    for (String groupSource : group) {
        // group "sources" are stages in the group that don't have an input from another stage in the group.
        if (Sets.difference(pipelinePhase.getStageInputs(groupSource), group).isEmpty()) {
            continue;
        }
        // get the branch by taking a subset of the pipeline starting from the "source".
        // with the example above, the two branches are t1 -> k1, and k2.
        PipelinePhase branch;
        if (pipelinePhase.getSinks().contains(groupSource)) {
            // pipelinePhase.subsetFrom() throws an exception if the new "source" is also a sink,
            // since a Dag cannot be a single node. so build it manually.
            branch = PipelinePhase.builder(pipelinePhase.getPluginTypes()).addStage(pipelinePhase.getStage(groupSource)).build();
        } else {
            branch = pipelinePhase.subsetFrom(Collections.singleton(groupSource));
        }
        try {
            branchExecutors.put(groupSource, executorFactory.create(branch));
        } catch (Exception e) {
            throw new IllegalStateException(String.format("Unable to get subset of pipeline starting from stage %s. " + "This indicates a planning error. Please report this bug and turn off stage " + "consolidation by setting %s to false in the runtime arguments.", groupSource, Constants.CONSOLIDATE_STAGES), e);
        }
        /*
          create a mapping from possible inputs to "group sources". This will help identify which incoming
          records should be sent to which branch executor.

          for example, the pipeline may look like:

                           |port a --> k1
             s --> split --|
                           |port b --> k2

          In this scenario, k1, and k2, are all in the same group, so the map contains:

            { stageName: split, port: a, type: output } -> [k1]
            { stageName: split, port: b, type: output } -> [k2]

          A slightly more complicated example:

                               |--> k1
            s1 --> transform --|
                      |        |--> k2
                      |
                      |--> error collector --> k3

          In this scenario, k1, k2, k3, and error collector are in the same group, so the map contains:

            { stageName: transform, type: output } -> [k1, k2]
            { stageName: transform, type: error } -> [k3]
       */
        String groupSourceType = pipelinePhase.getStage(groupSource).getPluginType();
        RecordType recordType = ErrorTransform.PLUGIN_TYPE.equals(groupSourceType) ? RecordType.ERROR : RecordType.OUTPUT;
        for (String inputStage : pipelinePhase.getStageInputs(groupSource)) {
            Map<String, StageSpec.Port> ports = pipelinePhase.getStage(inputStage).getOutputPorts();
            String port = ports.get(groupSource).getPort();
            InputInfo inputInfo = new InputInfo(inputStage, recordType, port);
            Set<String> groupSources = inputConnections.computeIfAbsent(inputInfo, key -> new HashSet<>());
            groupSources.add(groupSource);
        }
    }
}
Also used: DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory) SparkTransformExecutorFactory(io.cdap.cdap.etl.spark.SparkTransformExecutorFactory) RecordType(io.cdap.cdap.etl.common.RecordType) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator)
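
For illustration, here is a minimal, self-contained sketch of the routing idea described in the comments above: records are keyed by the stage, port, and record type they came from, and dispatched to the branch "sources" registered for that key. The BranchRouter class and its method names are hypothetical stand-ins, not CDAP API.

// Hypothetical sketch of the inputConnections routing idea (not CDAP API).
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

class BranchRouter {
    // key is "stageName|port|recordType"; value is the set of group "sources" to dispatch to
    private final Map<String, Set<String>> inputConnections = new HashMap<>();

    private static String key(String stageName, String port, String recordType) {
        return stageName + "|" + port + "|" + recordType;
    }

    // called while building the group, mirroring the loop over getStageInputs() above
    void register(String inputStage, String port, String recordType, String groupSource) {
        inputConnections.computeIfAbsent(key(inputStage, port, recordType), k -> new HashSet<>())
            .add(groupSource);
    }

    // called once per incoming record to decide which branch executors should receive it
    Set<String> route(String inputStage, String port, String recordType) {
        return inputConnections.getOrDefault(key(inputStage, port, recordType), Collections.emptySet());
    }
}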

Example 17 with DefaultMacroEvaluator

Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

The class SparkStreamingPipelineDriver, method run.

private JavaStreamingContext run(DataStreamsPipelineSpec pipelineSpec, PipelinePhase pipelinePhase, JavaSparkExecutionContext sec, @Nullable String checkpointDir, @Nullable JavaSparkContext context) throws Exception {
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
    SparkStreamingPreparer preparer = new SparkStreamingPreparer(pluginContext, sec.getMetrics(), evaluator, pipelineRuntime, sec);
    try {
        SparkFieldLineageRecorder recorder = new SparkFieldLineageRecorder(sec, pipelinePhase, pipelineSpec, preparer);
        recorder.record();
    } catch (Exception e) {
        LOG.warn("Failed to emit field lineage operations for streaming pipeline", e);
    }
    Set<String> uncombinableSinks = preparer.getUncombinableSinks();
    // the content in the function might not run due to spark checkpointing, currently just have the lineage logic
    // before anything is run
    Function0<JavaStreamingContext> contextFunction = (Function0<JavaStreamingContext>) () -> {
        JavaSparkContext javaSparkContext = context == null ? new JavaSparkContext() : context;
        JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.milliseconds(pipelineSpec.getBatchIntervalMillis()));
        SparkStreamingPipelineRunner runner = new SparkStreamingPipelineRunner(sec, jssc, pipelineSpec, pipelineSpec.isCheckpointsDisabled());
        // Seems like they should be set at configure time instead of runtime? but that requires an API change.
        try {
            PhaseSpec phaseSpec = new PhaseSpec(sec.getApplicationSpecification().getName(), pipelinePhase, Collections.emptyMap(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
            boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
            boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
            runner.runPipeline(phaseSpec, StreamingSource.PLUGIN_TYPE, sec, Collections.emptyMap(), pluginContext, Collections.emptyMap(), uncombinableSinks, shouldConsolidateStages, shouldCacheFunctions);
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        if (checkpointDir != null) {
            jssc.checkpoint(checkpointDir);
            jssc.sparkContext().hadoopConfiguration().set("fs.defaultFS", checkpointDir);
        }
        return jssc;
    };
    return checkpointDir == null ? contextFunction.call() : JavaStreamingContext.getOrCreate(checkpointDir, contextFunction, context.hadoopConfiguration());
}
Also used: PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SparkStreamingPreparer(io.cdap.cdap.etl.spark.streaming.SparkStreamingPreparer) Function0(org.apache.spark.api.java.function.Function0) IOException(java.io.IOException) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PhaseSpec(io.cdap.cdap.etl.common.PhaseSpec) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
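
The checkpoint handling above follows the standard Spark Streaming recover-or-create pattern: the context is built inside a Function0 so that the body only runs when no checkpoint exists; otherwise the context is restored from the checkpoint directory. Below is a minimal standalone sketch of that pattern, independent of CDAP; the application name, batch interval, and checkpoint path are placeholders, and a real job must register at least one output operation before start():

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class CheckpointExample {
    public static void main(String[] args) throws Exception {
        String checkpointDir = "/tmp/checkpoints";  // placeholder path
        Function0<JavaStreamingContext> createContext = () -> {
            SparkConf conf = new SparkConf().setAppName("checkpoint-example").setMaster("local[2]");
            JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
            // DStream setup goes here; this block only runs when no checkpoint exists yet
            jssc.checkpoint(checkpointDir);
            return jssc;
        };
        // recover from the checkpoint if present, otherwise build a fresh context
        JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir, createContext);
        jssc.start();
        jssc.awaitTermination();
    }
}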

Example 18 with DefaultMacroEvaluator

Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

The class PipelineAction, method run.

@Override
public void run() throws Exception {
    CustomActionContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
    Action action = pluginContext.newPluginInstance(stageSpec.getName(), new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace()));
    ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
    if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
        action.run(actionContext);
    }
    WorkflowToken token = context.getWorkflowToken();
    if (token == null) {
        throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
        token.put(entry.getKey(), entry.getValue());
    }
}
Also used: Action(io.cdap.cdap.etl.api.action.Action) AbstractCustomAction(io.cdap.cdap.api.customaction.AbstractCustomAction) CustomAction(io.cdap.cdap.api.customaction.CustomAction) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) ActionContext(io.cdap.cdap.etl.api.action.ActionContext) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) HashMap(java.util.HashMap) Map(java.util.Map)
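
The DefaultMacroEvaluator passed to newPluginInstance above is what substitutes ${...} macros in the action's configured properties at runtime. As a hedged sketch of how such an evaluator is typically consumed (assuming the lookup and evaluate signatures on io.cdap.cdap.api.macro.MacroEvaluator, and that DefaultMacroEvaluator handles the logicalStartTime macro function implied by the logical start time passed to its constructor; the property names are hypothetical):

import io.cdap.cdap.api.macro.InvalidMacroException;
import io.cdap.cdap.api.macro.MacroEvaluator;

public class MacroResolutionSketch {
    // resolves a hypothetical property of the form ${inputPath}/${logicalStartTime(yyyy-MM-dd)}
    static String resolve(MacroEvaluator evaluator) throws InvalidMacroException {
        // ${inputPath} is looked up from the runtime/workflow arguments
        String inputPath = evaluator.lookup("inputPath");
        // ${logicalStartTime(yyyy-MM-dd)} is handled by a macro function
        String runDate = evaluator.evaluate("logicalStartTime", "yyyy-MM-dd");
        return inputPath + "/" + runDate;
    }
}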

Example 19 with DefaultMacroEvaluator

Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

The class ETLMapReduce, method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    // instantiate plugins and call their prepare methods
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime, connectorDatasets);
    List<Finisher> finishers = preparer.prepare(phaseSpec, job);
    finisher = new CompositeFinisher(finishers);
}
Also used: PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) Configuration(org.apache.hadoop.conf.Configuration) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext) Finisher(io.cdap.cdap.etl.common.submit.Finisher) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) Job(org.apache.hadoop.mapreduce.Job) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)
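
The BatchPhaseSpec here is read back from a JSON string that was stored in the program's properties when the pipeline was deployed. Below is a minimal sketch of that serialize-at-deploy, deserialize-at-initialize round trip with Gson; the PhaseConfig class and the "pipeline.spec" property key are hypothetical placeholders, not the actual CDAP types or keys:

import com.google.gson.Gson;
import java.util.HashMap;
import java.util.Map;

public class SpecRoundTrip {
    // hypothetical stand-in for a spec object such as BatchPhaseSpec
    static class PhaseConfig {
        String phaseName;
        int numStages;
    }

    public static void main(String[] args) {
        Gson gson = new Gson();

        // at configure/deploy time: serialize the spec into the program properties
        PhaseConfig spec = new PhaseConfig();
        spec.phaseName = "phase-1";
        spec.numStages = 3;
        Map<String, String> properties = new HashMap<>();
        properties.put("pipeline.spec", gson.toJson(spec));

        // at initialize time: read it back, as ETLMapReduce does with Constants.PIPELINEID
        PhaseConfig restored = gson.fromJson(properties.get("pipeline.spec"), PhaseConfig.class);
        System.out.println(restored.phaseName + " has " + restored.numStages + " stages");
    }
}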

Example 20 with DefaultMacroEvaluator

Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

The class JavaSparkMainWrapper, method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
    BatchPhaseSpec batchPhaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), batchPhaseSpec.isStageLoggingEnabled(), batchPhaseSpec.isProcessTimingEnabled());
    Class<?> mainClass = pluginContext.loadPluginClass(stageName);
    // if it's a CDAP JavaSparkMain, instantiate it and call the run method
    if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
        MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
        JavaSparkMain javaSparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
        javaSparkMain.run(sec);
    } else {
        // otherwise, assume there is a 'main' method and call it
        String programArgs = getProgramArgs(sec, stageName);
        String[] args = programArgs == null ? RuntimeArguments.toPosixArray(sec.getRuntimeArguments()) : programArgs.split(" ");
        final Method mainMethod = mainClass.getMethod("main", String[].class);
        final Object[] methodArgs = new Object[1];
        methodArgs[0] = args;
        Caller caller = pluginContext.getCaller(stageName);
        caller.call(new Callable<Void>() {

            @Override
            public Void call() throws Exception {
                mainMethod.invoke(null, methodArgs);
                return null;
            }
        });
    }
}
Also used: DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) Method(java.lang.reflect.Method) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) Caller(io.cdap.cdap.etl.common.plugin.Caller) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) JavaSparkMain(io.cdap.cdap.api.spark.JavaSparkMain) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
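
When the plugin class is not a JavaSparkMain, the wrapper falls back to invoking its static main method reflectively. Below is a minimal standalone sketch of that reflection pattern, showing why the argument array is wrapped in an Object[] (so invoke() treats the String[] as a single parameter instead of spreading its elements); the Target class and arguments are hypothetical:

import java.lang.reflect.Method;

public class ReflectiveMain {
    public static class Target {
        public static void main(String[] args) {
            System.out.println("invoked with " + args.length + " args");
        }
    }

    public static void main(String[] args) throws Exception {
        Class<?> mainClass = Target.class;
        Method mainMethod = mainClass.getMethod("main", String[].class);
        String[] programArgs = {"--input", "/tmp/in"};
        // wrap the String[] so reflection passes it as a single argument
        mainMethod.invoke(null, new Object[] {programArgs});
    }
}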

Aggregations

DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 40
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 38
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 26
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 22
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 20
HashMap (java.util.HashMap): 16
Map (java.util.Map): 16
PluginContext (io.cdap.cdap.api.plugin.PluginContext): 14
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 14
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 12
SparkPipelineRuntime (io.cdap.cdap.etl.spark.SparkPipelineRuntime): 10
SparkPipelinePluginContext (io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext): 10
MacroParserOptions (io.cdap.cdap.api.macro.MacroParserOptions): 8
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 8
AlertPublisher (io.cdap.cdap.etl.api.AlertPublisher): 6
OAuthMacroEvaluator (io.cdap.cdap.etl.common.OAuthMacroEvaluator): 6
SecureStoreMacroEvaluator (io.cdap.cdap.etl.common.SecureStoreMacroEvaluator): 6
TxRunnable (io.cdap.cdap.api.TxRunnable): 5
DatasetContext (io.cdap.cdap.api.data.DatasetContext): 5
ArrayList (java.util.ArrayList): 5