Search in sources :

Example 6 with MacroEvaluator

use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.

The initialize method of the ETLWorkflow class.

/**
 * Initializes the workflow and instantiates the pipeline's ending actions.
 *
 * <p>Reads the JSON stored under the {@code "pipeline.spec"} workflow property, deserializes it into a
 * {@link BatchPipelineSpec}, and creates one {@link PostAction} plugin instance per ending action.
 * The instances are stored in {@code postActions}, keyed by action name, in the order the spec lists them.
 *
 * @param context the workflow context used to read the specification and instantiate plugins
 * @throws Exception if the superclass initialization or plugin instantiation fails
 */
@Override
public void initialize(WorkflowContext context) throws Exception {
    super.initialize(context);
    postActions = new LinkedHashMap<>();

    String pipelineSpecJson = context.getWorkflowSpecification().getProperty("pipeline.spec");
    BatchPipelineSpec pipelineSpec = GSON.fromJson(pipelineSpecJson, BatchPipelineSpec.class);

    // A single evaluator is shared by every ending action.
    MacroEvaluator evaluator = new DefaultMacroEvaluator(
        context.getToken(),
        context.getRuntimeArguments(),
        context.getLogicalStartTime(),
        context,
        context.getNamespace());

    for (ActionSpec endingAction : pipelineSpec.getEndingActions()) {
        String actionName = endingAction.getName();
        PostAction postAction = (PostAction) context.newPluginInstance(actionName, evaluator);
        postActions.put(actionName, postAction);
    }
}
Also used : MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator)

Example 7 with MacroEvaluator

use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.

The initialize method of the ETLSpark class.

/**
 * Prepares a Spark run for one pipeline phase.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>builds a {@link SparkConf} with fixed JVM/speculation settings plus the phase's pipeline properties
 *       and hands it to the client context;</li>
 *   <li>instantiates every stage plugin of a recognized type, calls its {@code prepareRun} with a
 *       type-specific context, and registers the pair with a {@link CompositeFinisher} builder;</li>
 *   <li>serializes the source factory, sink factory and per-stage partition counts as JSON into a temp file
 *       and localizes that file under the name {@code "HydratorSpark.config"}.</li>
 * </ol>
 *
 * @throws Exception if plugin instantiation, any {@code prepareRun} call, or writing the config file fails
 */
@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    // Temp files created here are recorded in this list; the code that consumes it is outside this method.
    cleanupFiles = new ArrayList<>();
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    // NOTE(review): these properties are set on sparkConf AFTER it was passed to context.setSparkConf above.
    // They only reach the run if the context keeps a reference to this object rather than a copy — confirm.
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // One evaluator is shared by all plugin instantiations below.
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    // Partition counts reported by aggregator and joiner stages, keyed by stage name.
    Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    // Stages whose plugin type matches none of the branches below are skipped here.
    for (StageInfo stageInfo : phaseSpec.getPhase()) {
        String stageName = stageInfo.getName();
        String pluginType = stageInfo.getPluginType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
            batchSource.prepareRun(sourceContext);
            finishers.add(batchSource, sourceContext);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
            // NOTE(review): the third constructor argument is null; its meaning is not visible from here.
            BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
            batchSink.prepareRun(sinkContext);
            finishers.add(batchSink, sinkContext);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
            SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
            sparkSink.prepareRun(sparkPluginContext);
            finishers.add(sparkSink, sparkPluginContext);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
            aggregator.prepareRun(aggregatorContext);
            finishers.add(aggregator, aggregatorContext);
            // Read the partition count only after prepareRun has had the chance to set it on the context.
            stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
            joiner.prepareRun(sparkJoinerContext);
            finishers.add(joiner, sparkJoinerContext);
            // Same as for aggregators: the count is read after prepareRun.
            stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    // Register the file before writing so it is still tracked if the write below throws.
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = finishers.build();
    // Make the written config available to the Spark program under a fixed name.
    context.localize("HydratorSpark.config", configFile.toURI());
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) HashMap(java.util.HashMap) StageInfo(co.cask.cdap.etl.planner.StageInfo) SparkClientContext(co.cask.cdap.api.spark.SparkClientContext) CompositeFinisher(co.cask.cdap.etl.common.CompositeFinisher) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) SparkPluginContext(co.cask.cdap.etl.api.batch.SparkPluginContext) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) File(java.io.File) Writer(java.io.Writer)

Example 8 with MacroEvaluator

use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.

The initialize method of the SmartWorkflow class.

/**
 * Initializes the workflow, logs the pipeline start, and instantiates the pipeline's ending actions.
 *
 * <p>The pipeline spec is read from the workflow property named by {@code Constants.PIPELINE_SPEC_KEY} and
 * stored in {@code spec}. Each ending action listed in the spec is instantiated as a {@link PostAction}
 * through a {@link PipelinePluginContext} and stored in {@code postActions}, keyed by action name.
 *
 * @param context the workflow context used to read the specification, arguments and plugins
 * @throws Exception if the superclass initialization, user lookup or plugin instantiation fails
 */
@Override
public void initialize(WorkflowContext context) throws Exception {
    super.initialize(context);

    // Render runtime arguments as "key=value, key=value" for the start-up log line.
    String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
    String pipelineName = context.getApplicationSpecification().getName();
    String userName = UserGroupInformation.getCurrentUser().getShortUserName();
    WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", pipelineName, userName, arguments);

    postActions = new LinkedHashMap<>();
    String specJson = context.getWorkflowSpecification().getProperty(Constants.PIPELINE_SPEC_KEY);
    spec = GSON.fromJson(specJson, BatchPipelineSpec.class);

    MacroEvaluator evaluator = new DefaultMacroEvaluator(
        context.getToken(),
        context.getRuntimeArguments(),
        context.getLogicalStartTime(),
        context,
        context.getNamespace());
    PluginContext plugins = new PipelinePluginContext(
        context,
        workflowMetrics,
        spec.isStageLoggingEnabled(),
        spec.isProcessTimingEnabled());

    for (ActionSpec endingAction : spec.getEndingActions()) {
        String actionName = endingAction.getName();
        PostAction postAction = (PostAction) plugins.newPluginInstance(actionName, evaluator);
        postActions.put(actionName, postAction);
    }

    WRAPPERLOGGER.info("Pipeline '{}' running", pipelineName);
}
Also used : BatchPipelineSpec(co.cask.cdap.etl.batch.BatchPipelineSpec) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) ActionSpec(co.cask.cdap.etl.batch.ActionSpec) PluginContext(co.cask.cdap.api.plugin.PluginContext) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) PipelinePluginContext(co.cask.cdap.etl.common.plugin.PipelinePluginContext)

Example 9 with MacroEvaluator

use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.

The getSource method of the SparkStreamingPipelineRunner class.

/**
 * Builds the output collection for a streaming source stage.
 *
 * <p>The source plugin is created in one of two ways depending on {@code checkpointsDisabled}, asked for its
 * stream, optionally limited when the stage's data tracer is enabled, and finally passed through a counting
 * transform ({@code "records.out"}) and an output-wrapping map.
 *
 * @param stageInfo the stage describing the streaming source
 * @return a {@link DStreamCollection} over the wrapped source records
 * @throws Exception if the plugin cannot be created or its stream cannot be obtained
 */
@Override
protected SparkCollection<Tuple2<Boolean, Object>> getSource(StageInfo stageInfo) throws Exception {
    StreamingSource<Object> streamingSource;
    if (!checkpointsDisabled) {
        // Checkpointing is enabled: reject macros in the StreamingSource.
        // With checkpoints, SparkStreaming serializes every InputDStream into the checkpoint, so on restore the
        // InputDStream is deserialized directly from the checkpoint instead of being instantiated through CDAP.
        // That leaves no way to perform macro evaluation on sources loaded from checkpoints. Other pipeline
        // stages work around this by instantiating the plugin dynamically inside the DStream functions, but
        // InputDStreams cannot, because the InputDStream constructor adds itself to the context dag.
        // TODO: (HYDRATOR-1030) figure out how to do this at configure time instead of run time
        MacroEvaluator failingEvaluator = new ErrorMacroEvaluator("Due to spark limitations, macro evaluation is not allowed in streaming sources when checkpointing " + "is enabled.");
        PluginContext wrappedPluginContext = new SparkPipelinePluginContext(
            sec.getPluginContext(),
            sec.getMetrics(),
            spec.isStageLoggingEnabled(),
            spec.isProcessTimingEnabled());
        streamingSource = wrappedPluginContext.newPluginInstance(stageInfo.getName(), failingEvaluator);
    } else {
        streamingSource = new PluginFunctionContext(stageInfo, sec).createPlugin();
    }

    DataTracer tracer = sec.getDataTracer(stageInfo.getName());
    StreamingContext streamingSourceContext = new DefaultStreamingContext(stageInfo, sec, streamingContext);
    JavaDStream<Object> records = streamingSource.getStream(streamingSourceContext);

    if (tracer.isEnabled()) {
        // Apply the limit configured by spec.getNumOfRecordsPreview(). The original note warns that a new
        // function is created for each RDD, which would limit each RDD but not the entire DStream.
        records = records.transform(new LimitingFunction<>(spec.getNumOfRecordsPreview()));
    }

    JavaDStream<Object> countedRecords = records.transform(
        new CountingTransformFunction<>(stageInfo.getName(), sec.getMetrics(), "records.out", tracer));
    JavaDStream<Tuple2<Boolean, Object>> wrappedRecords = countedRecords.map(new WrapOutputTransformFunction<>());
    return new DStreamCollection<>(sec, wrappedRecords);
}
Also used : PairDStreamCollection(co.cask.cdap.etl.spark.streaming.PairDStreamCollection) DStreamCollection(co.cask.cdap.etl.spark.streaming.DStreamCollection) StreamingContext(co.cask.cdap.etl.api.streaming.StreamingContext) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) DefaultStreamingContext(co.cask.cdap.etl.spark.streaming.DefaultStreamingContext) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(co.cask.cdap.api.plugin.PluginContext) CountingTransformFunction(co.cask.cdap.etl.spark.streaming.function.CountingTransformFunction) DefaultStreamingContext(co.cask.cdap.etl.spark.streaming.DefaultStreamingContext) PluginFunctionContext(co.cask.cdap.etl.spark.function.PluginFunctionContext) SparkPipelinePluginContext(co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext) Tuple2(scala.Tuple2) DataTracer(co.cask.cdap.api.preview.DataTracer) LimitingFunction(co.cask.cdap.etl.spark.streaming.function.preview.LimitingFunction)

Aggregations

MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)9 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)8 PluginContext (co.cask.cdap.api.plugin.PluginContext)5 SparkPipelinePluginContext (co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext)5 BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner)3 BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)3 StageInfo (co.cask.cdap.etl.planner.StageInfo)3 HashMap (java.util.HashMap)3 Map (java.util.Map)3 TxRunnable (co.cask.cdap.api.TxRunnable)2 DatasetContext (co.cask.cdap.api.data.DatasetContext)2 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)2 BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext)2 BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext)2 SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext)2 DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext)2 DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext)2 CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher)2 PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext)2 MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext)1