
Example 6 with StageInfo

Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

The class ETLSpark, method initialize().

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    SparkConf sparkConf = new SparkConf();
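    // Note: -XX:MaxPermSize is a Java 7-era flag; JVMs from Java 8 onward ignore it,
    // since the permanent generation was removed.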
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
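    // Call prepareRun() on every stage in the phase and register a finisher for each,
    // recording partition counts for aggregators and joiners.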
    for (StageInfo stageInfo : phaseSpec.getPhase()) {
        String stageName = stageInfo.getName();
        String pluginType = stageInfo.getPluginType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
            batchSource.prepareRun(sourceContext);
            finishers.add(batchSource, sourceContext);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
            batchSink.prepareRun(sinkContext);
            finishers.add(batchSink, sinkContext);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
            SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
            sparkSink.prepareRun(sparkPluginContext);
            finishers.add(sparkSink, sparkPluginContext);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
            aggregator.prepareRun(aggregatorContext);
            finishers.add(aggregator, aggregatorContext);
            stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
            joiner.prepareRun(sparkJoinerContext);
            finishers.add(joiner, sparkJoinerContext);
            stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = finishers.build();
    context.localize("HydratorSpark.config", configFile.toURI());
}
Also used: DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext), MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator), DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator), HashMap (java.util.HashMap), StageInfo (co.cask.cdap.etl.planner.StageInfo), SparkClientContext (co.cask.cdap.api.spark.SparkClientContext), CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher), SparkPipelinePluginContext (co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext), DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext), BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator), PluginContext (co.cask.cdap.api.plugin.PluginContext), SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext), BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext), BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext), BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner), BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec), SparkConf (org.apache.spark.SparkConf), Map (java.util.Map), BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable), File (java.io.File), Writer (java.io.Writer)
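
The prepared factories and partition counts reach the actual Spark program through the localized "HydratorSpark.config" file. A minimal sketch of the read-back side, assuming the same GSON instance and that the Spark program resolves the localized file by name (variable names here are illustrative):

// Sketch: deserialize the factories that initialize() wrote out.
// Assumes "HydratorSpark.config" is available in the working directory,
// as localized above.
File configFile = new File("HydratorSpark.config");
SparkBatchSourceSinkFactoryInfo sourceSinkInfo;
try (Reader reader = Files.newBufferedReader(configFile.toPath(), StandardCharsets.UTF_8)) {
    sourceSinkInfo = GSON.fromJson(reader, SparkBatchSourceSinkFactoryInfo.class);
}
// sourceSinkInfo now carries the source factory, sink factory, and per-stage partition counts.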

Example 7 with StageInfo

Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

The class ETLWorker, method initializeTransforms().

private void initializeTransforms(WorkerContext context, Map<String, TransformDetail> transformDetailMap, PipelinePhase pipeline) throws Exception {
    Set<StageInfo> transformInfos = pipeline.getStagesOfType(Transform.PLUGIN_TYPE);
    Preconditions.checkArgument(transformInfos != null);
    tranformIdToDatasetName = new HashMap<>(transformInfos.size());
    for (StageInfo transformInfo : transformInfos) {
        String transformName = transformInfo.getName();
        try {
            Transform<?, ?> transform = context.newPluginInstance(transformName);
            transform = new WrappedTransform<>(transform, Caller.DEFAULT);
            WorkerRealtimeContext transformContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), transformInfo);
            LOG.debug("Transform Class : {}", transform.getClass().getName());
            transform.initialize(transformContext);
            StageMetrics stageMetrics = new DefaultStageMetrics(metrics, transformName);
            transformDetailMap.put(transformName, new TransformDetail(new TrackedTransform<>(transform, stageMetrics, context.getDataTracer(transformName)), pipeline.getStageOutputs(transformName)));
            if (transformInfo.getErrorDatasetName() != null) {
                tranformIdToDatasetName.put(transformName, transformInfo.getErrorDatasetName());
            }
        } catch (InstantiationException e) {
            LOG.error("Unable to instantiate Transform", e);
            Throwables.propagate(e);
        }
    }
}
Also used: TrackedTransform (co.cask.cdap.etl.common.TrackedTransform), StageInfo (co.cask.cdap.etl.planner.StageInfo), TxLookupProvider (co.cask.cdap.etl.common.TxLookupProvider), TransformDetail (co.cask.cdap.etl.common.TransformDetail), StageMetrics (co.cask.cdap.etl.api.StageMetrics), DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics)
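
For context, a minimal Transform plugin of the kind the loop above instantiates and wraps; a sketch against the co.cask.cdap.etl.api.Transform contract, with the plugin name "Identity" made up for illustration:

import co.cask.cdap.api.annotation.Name;
import co.cask.cdap.api.annotation.Plugin;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.Transform;

// A pass-through transform: everything received is emitted unchanged.
@Plugin(type = Transform.PLUGIN_TYPE)
@Name("Identity")
public class IdentityTransform extends Transform<StructuredRecord, StructuredRecord> {
    @Override
    public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
        emitter.emit(input);
    }
}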

Example 8 with StageInfo

Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

The class ETLWorker, method initializeSinks().

@SuppressWarnings("unchecked")
private void initializeSinks(WorkerContext context, Map<String, TransformDetail> transformationMap, PipelinePhase pipeline) throws Exception {
    Set<StageInfo> sinkInfos = pipeline.getStagesOfType(RealtimeSink.PLUGIN_TYPE);
    sinks = new HashMap<>(sinkInfos.size());
    for (StageInfo sinkInfo : sinkInfos) {
        String sinkName = sinkInfo.getName();
        RealtimeSink sink = context.newPluginInstance(sinkName);
        sink = new LoggedRealtimeSink(sinkName, sink);
        WorkerRealtimeContext sinkContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), sinkInfo);
        LOG.debug("Sink Class : {}", sink.getClass().getName());
        sink.initialize(sinkContext);
        sink = new TrackedRealtimeSink(sink, new DefaultStageMetrics(metrics, sinkName));
        Transformation identityTransformation = new Transformation() {

            @Override
            public void transform(Object input, Emitter emitter) throws Exception {
                emitter.emit(input);
            }
        };
        // We use an identity transformation to simplify executing transformations in the
        // pipeline (similar to ETLMapReduce). Since metrics should be emitted during the write
        // to the sink rather than during this pass-through step, only the records-in metric is
        // tracked here; the records-out metric name is left null.
        TrackedTransform trackedTransform = new TrackedTransform(identityTransformation, new DefaultStageMetrics(metrics, sinkName), TrackedTransform.RECORDS_IN, null, context.getDataTracer(sinkName));
        transformationMap.put(sinkInfo.getName(), new TransformDetail(trackedTransform, new HashSet<String>()));
        sinks.put(sinkInfo.getName(), sink);
    }
}
Also used: TrackedTransform (co.cask.cdap.etl.common.TrackedTransform), Transformation (co.cask.cdap.etl.api.Transformation), Emitter (co.cask.cdap.etl.api.Emitter), DefaultEmitter (co.cask.cdap.etl.common.DefaultEmitter), TrackedEmitter (co.cask.cdap.etl.common.TrackedEmitter), StageInfo (co.cask.cdap.etl.planner.StageInfo), TxLookupProvider (co.cask.cdap.etl.common.TxLookupProvider), TransformDetail (co.cask.cdap.etl.common.TransformDetail), RealtimeSink (co.cask.cdap.etl.api.realtime.RealtimeSink), DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics), HashSet (java.util.HashSet)
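
Since Transformation declares a single abstract method, the anonymous class above collapses to a lambda on Java 8+; a sketch, assuming nothing beyond the transform(input, emitter) method used above:

// Equivalent identity transformation written as a lambda (Java 8+).
Transformation<Object, Object> identityTransformation = (input, emitter) -> emitter.emit(input);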

Example 9 with StageInfo

Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

The class DynamicSparkCompute, method lazyInit().

// when checkpointing is enabled, and Spark is loading DStream operations from an existing checkpoint,
// delegate will be null and the initialize() method won't have been called. So we need to instantiate
// the delegate and initialize it.
private void lazyInit(final JavaSparkContext jsc) throws Exception {
    if (delegate == null) {
        PluginFunctionContext pluginFunctionContext = dynamicDriverContext.getPluginFunctionContext();
        delegate = pluginFunctionContext.createPlugin();
        final StageInfo stageInfo = pluginFunctionContext.getStageInfo();
        final JavaSparkExecutionContext sec = dynamicDriverContext.getSparkExecutionContext();
        Transactionals.execute(sec, new TxRunnable() {

            @Override
            public void run(DatasetContext datasetContext) throws Exception {
                SparkExecutionPluginContext sparkPluginContext = new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, stageInfo);
                delegate.initialize(sparkPluginContext);
            }
        }, Exception.class);
    }
}
Also used: BasicSparkExecutionPluginContext (co.cask.cdap.etl.spark.batch.BasicSparkExecutionPluginContext), PluginFunctionContext (co.cask.cdap.etl.spark.function.PluginFunctionContext), SparkExecutionPluginContext (co.cask.cdap.etl.api.batch.SparkExecutionPluginContext), StageInfo (co.cask.cdap.etl.planner.StageInfo), TxRunnable (co.cask.cdap.api.TxRunnable), JavaSparkExecutionContext (co.cask.cdap.api.spark.JavaSparkExecutionContext), DatasetContext (co.cask.cdap.api.data.DatasetContext), TransactionFailureException (org.apache.tephra.TransactionFailureException)
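
Because TxRunnable likewise has a single run(DatasetContext) method, the transactional block can be written as a lambda on Java 8+; a sketch of the same initialization (the cast disambiguates in case of overloads):

// Same initialization with a lambda in place of the anonymous TxRunnable.
Transactionals.execute(sec, (TxRunnable) datasetContext -> {
    SparkExecutionPluginContext sparkPluginContext =
        new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, stageInfo);
    delegate.initialize(sparkPluginContext);
}, Exception.class);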

Example 10 with StageInfo

Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

The class TransformExecutorFactory, method create().

/**
   * Create a transform executor for the specified pipeline. Will instantiate and initialize all sources,
   * transforms, and sinks in the pipeline.
   *
   * @param pipeline the pipeline to create a transform executor for
   * @return executor for the pipeline
   * @throws InstantiationException if there was an error instantiating a plugin
   * @throws Exception              if there was an error initializing a plugin
   */
public <KEY_OUT, VAL_OUT> PipeTransformExecutor<T> create(PipelinePhase pipeline, OutputWriter<KEY_OUT, VAL_OUT> outputWriter, Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap) throws Exception {
    Map<String, PipeTransformDetail> transformations = new HashMap<>();
    Set<String> sources = pipeline.getSources();
    // Record the input and output schemas for each stage
    for (String pluginType : pipeline.getPluginTypes()) {
        for (StageInfo stageInfo : pipeline.getStagesOfType(pluginType)) {
            String stageName = stageInfo.getName();
            outputSchemas.put(stageName, stageInfo.getOutputSchema());
            perStageInputSchemas.put(stageName, stageInfo.getInputSchemas());
        }
    }
    // recursively set PipeTransformDetail for all the stages
    for (String source : sources) {
        setPipeTransformDetail(pipeline, source, transformations, transformErrorSinkMap, outputWriter);
    }
    // sourceStageName will be null in reducers, so we need to handle that case
    Set<String> startingPoints = (sourceStageName == null) ? pipeline.getSources() : Sets.newHashSet(sourceStageName);
    return new PipeTransformExecutor<>(transformations, startingPoints);
}
Also used: HashMap (java.util.HashMap), StageInfo (co.cask.cdap.etl.planner.StageInfo), PipeTransformExecutor (co.cask.cdap.etl.batch.mapreduce.PipeTransformExecutor)
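
A hedged usage sketch of create(), with the factory, phase, and writer built elsewhere (all variable names are illustrative, and StructuredRecord stands in for the factory's type parameter):

// Sketch: building an executor for a phase. Only the create(...) signature
// above is given; factory, pipeline, and outputWriter are assumed inputs.
Map<String, ErrorOutputWriter<Object, Object>> errorSinks = new HashMap<>();
PipeTransformExecutor<StructuredRecord> executor =
    factory.create(pipeline, outputWriter, errorSinks);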

Aggregations

StageInfo (co.cask.cdap.etl.planner.StageInfo): 14
HashMap (java.util.HashMap): 8
Map (java.util.Map): 6
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 4
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 3
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 3
BasicArguments (co.cask.cdap.etl.common.BasicArguments): 3
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 3
PluginContext (co.cask.cdap.api.plugin.PluginContext): 2
WorkflowContext (co.cask.cdap.api.workflow.WorkflowContext): 2
BatchActionContext (co.cask.cdap.etl.api.batch.BatchActionContext): 2
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 2
BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext): 2
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 2
PostAction (co.cask.cdap.etl.api.batch.PostAction): 2
DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext): 2
DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext): 2
CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher): 2
DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics): 2
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 2