Search in sources :

Example 11 with DefaultMacroEvaluator

use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

In the class StreamingSparkSinkFunction, method call.

/**
 * Writes one streaming micro-batch through the Spark sink plugin of this stage.
 *
 * <p>Empty batches are skipped. Otherwise the sink plugin is instantiated, {@code prepareRun} is invoked,
 * the batch is passed to {@code run} (after being wrapped in a "records.in" counting function and cached),
 * and finally {@code onRunFinish(true, ...)} is invoked. Each of these plugin calls happens inside its own
 * {@code sec.execute(...)} call. Exceptions raised by these steps are logged rather than rethrown. If
 * {@code prepareRun} succeeded but {@code run} did not complete, {@code onRunFinish(false, ...)} is invoked.
 *
 * @param data      the records of the current micro-batch
 * @param batchTime the batch time; its millisecond value is used as the logical start time
 * @throws Exception if instantiating the plugin fails, or if the failure-path {@code onRunFinish} call fails
 */
@Override
public void call(JavaRDD<T> data, Time batchTime) throws Exception {
    // Nothing to write for an empty micro-batch.
    if (data.isEmpty()) {
        return;
    }

    final long logicalStartTime = batchTime.milliseconds();
    BasicArguments arguments = new BasicArguments(sec);
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
        arguments, logicalStartTime, sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
    final PluginContext plugins = new SparkPipelinePluginContext(
        sec.getPluginContext(), sec.getMetrics(),
        stageSpec.isStageLoggingEnabled(), stageSpec.isProcessTimingEnabled());
    final PipelineRuntime runtime = new SparkPipelineRuntime(sec, logicalStartTime);
    final String stageName = stageSpec.getName();
    final SparkSink<T> sink = plugins.newPluginInstance(stageName, macroEvaluator);

    // Track how far the lifecycle got so the finally block knows whether a failure callback is owed.
    boolean prepared = false;
    boolean ran = false;
    try {
        // Step 1: prepareRun.
        sec.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext datasets) throws Exception {
                SparkPluginContext prepareContext =
                    new BasicSparkPluginContext(null, runtime, stageSpec, datasets, sec.getAdmin());
                sink.prepareRun(prepareContext);
            }
        });
        prepared = true;

        // Step 2: run the sink over the counted, cached batch.
        JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(data.rdd().context());
        final SparkExecutionPluginContext executionContext =
            new SparkStreamingExecutionContext(sec, javaSparkContext, logicalStartTime, stageSpec);
        CountingFunction<T> recordsInCounter =
            new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null);
        final JavaRDD<T> counted = data.map(recordsInCounter).cache();
        sec.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext datasets) throws Exception {
                sink.run(executionContext, counted);
            }
        });
        ran = true;

        // Step 3: report success to the plugin.
        sec.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext datasets) throws Exception {
                SparkPluginContext finishContext =
                    new BasicSparkPluginContext(null, runtime, stageSpec, datasets, sec.getAdmin());
                sink.onRunFinish(true, finishContext);
            }
        });
    } catch (Exception e) {
        LOG.error("Error while executing sink {} for the batch for time {}.", stageName, logicalStartTime, e);
    } finally {
        // prepareRun succeeded but run did not complete: the plugin still gets its failure callback.
        if (prepared) {
            if (!ran) {
                sec.execute(new TxRunnable() {

                    @Override
                    public void run(DatasetContext datasets) throws Exception {
                        SparkPluginContext finishContext =
                            new BasicSparkPluginContext(null, runtime, stageSpec, datasets, sec.getAdmin());
                        sink.onRunFinish(false, finishContext);
                    }
                });
            }
        }
    }
}
Also used : DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) BasicSparkPluginContext(io.cdap.cdap.etl.spark.batch.BasicSparkPluginContext) SparkExecutionPluginContext(io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext) SparkPluginContext(io.cdap.cdap.etl.api.batch.SparkPluginContext) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) SparkStreamingExecutionContext(io.cdap.cdap.etl.spark.streaming.SparkStreamingExecutionContext) CountingFunction(io.cdap.cdap.etl.spark.function.CountingFunction) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) SparkExecutionPluginContext(io.cdap.cdap.etl.api.batch.SparkExecutionPluginContext) TxRunnable(io.cdap.cdap.api.TxRunnable) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) DatasetContext(io.cdap.cdap.api.data.DatasetContext) BasicSparkPluginContext(io.cdap.cdap.etl.spark.batch.BasicSparkPluginContext) SparkPluginContext(io.cdap.cdap.etl.api.batch.SparkPluginContext) BasicSparkPluginContext(io.cdap.cdap.etl.spark.batch.BasicSparkPluginContext)

Example 12 with DefaultMacroEvaluator

use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

In the class BatchSparkPipelineDriver, method run.

/**
 * Runs one batch pipeline phase on Spark.
 *
 * <p>The phase specification is read from the program specification, the source/sink factories and stage
 * partitions are read from the localized {@code HydratorSpark.config} file, and the pipeline is then executed
 * via {@code runPipeline}. When the phase declares a SQL engine stage, preview is not enabled and pushdown is
 * not disabled through runtime arguments, a SQL engine adapter is instantiated and prepared first.
 *
 * <p>Regardless of the outcome, the workflow token is updated and the SQL engine adapter (if any) is notified
 * and closed. The adapter is closed even if updating the token or notifying the adapter throws.
 *
 * @param context the dataset context to use for this run
 * @throws IllegalStateException if the localized configuration file contains no source/sink information
 * @throws Exception if reading the configuration or running the pipeline fails
 */
@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        // Gson yields null for a null or empty document; fail with a clear message instead of a bare NPE.
        if (sourceSinkInfo == null) {
            throw new IllegalStateException("No source/sink factory information found in HydratorSpark.config");
        }
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    // Per-stage statistics are only collected when the pipeline contains a condition.
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean shouldDisablePushdown = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.DISABLE_ELT_PUSHDOWN, Boolean.FALSE.toString()));
        // An empty phase is treated as preview; otherwise the data tracer of the first stage decides.
        boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null && !shouldDisablePushdown) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                // A failed instantiation is logged only; the pipeline still runs without the SQL engine.
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        // Nested try/finally so that a failure in one cleanup step cannot skip closing the adapter.
        try {
            updateWorkflowToken(sec.getWorkflowToken(), collectors);
        } finally {
            // Close SQL Engine Adapter if needed.
            if (sqlEngineAdapter != null) {
                try {
                    sqlEngineAdapter.onRunFinish(isSuccessful);
                } finally {
                    sqlEngineAdapter.close();
                }
            }
        }
    }
}
Also used : Path(java.nio.file.Path) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) HashMap(java.util.HashMap) SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BufferedReader(java.io.BufferedReader) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)

Example 13 with DefaultMacroEvaluator

use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

In the class ETLSpark, method initialize.

/**
 * Configures Spark for the pipeline phase and prepares the phase's stages.
 *
 * <p>A {@link SparkConf} with a fixed set of defaults is created and handed to the client context, the
 * pipeline-level properties from the phase specification are then applied to that same configuration
 * object, and finally the phase is prepared. The finishers returned by the preparer are combined and
 * stored in {@code finisher}.
 *
 * @throws Exception if preparing the phase fails
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    SparkClientContext clientContext = getContext();

    SparkConf conf = new SparkConf()
        .set("spark.speculation", "false")
        // turn off auto-broadcast by default until we better understand the implications and can set this
        // to a value that we are confident is safe.
        .set("spark.sql.autoBroadcastJoinThreshold", "-1")
        .set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512))
        .set("spark.network.timeout", "600s")
        // Disable yarn app retries since spark already performs retries at a task level.
        .set("spark.yarn.maxAppAttempts", "1")
        // to make sure fields that are the same but different casing are treated as different fields in
        // auto-joins, see CDAP-17024
        .set("spark.sql.caseSensitive", "true");
    // NOTE(review): the conf is handed over before the pipeline properties below are applied; this presumably
    // relies on the context keeping a reference to this object rather than a copy -- confirm.
    clientContext.setSparkConf(conf);

    String phaseJson = clientContext.getSpecification().getProperties().get(Constants.PIPELINEID);
    BatchPhaseSpec phaseSpec = GSON.fromJson(phaseJson, BatchPhaseSpec.class);
    // Pipeline-level properties are applied on top of the defaults above.
    phaseSpec.getPipelineProperties().forEach((key, value) -> conf.set(key, value));

    PipelineRuntime runtime = new PipelineRuntime(clientContext);
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
        runtime.getArguments(), clientContext.getLogicalStartTime(),
        clientContext, clientContext, clientContext.getNamespace());
    SparkPreparer sparkPreparer =
        new SparkPreparer(clientContext, clientContext.getMetrics(), macroEvaluator, runtime);
    finisher = new CompositeFinisher(sparkPreparer.prepare(phaseSpec));
}
Also used : PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) Finisher(io.cdap.cdap.etl.common.submit.Finisher) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)

Example 14 with DefaultMacroEvaluator

use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

In the class SparkPipelineRunner, method runPipeline.

/**
 * Builds the Spark computation for every stage of a pipeline phase and then executes the sink tasks.
 *
 * <p>Stages are visited in topological order of the (optionally consolidated) DAG. For each stage the input
 * collections emitted by upstream stages are gathered and, depending on the plugin type, the stage is handled
 * as a source, a sink, an alert publisher, an error transform or a generic transform. Sink work is collected
 * as {@link Runnable} tasks and executed at the end, either sequentially or, when the runtime argument
 * {@code pipeline.spark.parallel.sinks.enabled} is true, in parallel on a fixed thread pool.
 *
 * @param phaseSpec         specification of the phase to run
 * @param sourcePluginType  plugin type that is treated as a source when a stage has no input data
 * @param sec               Spark execution context
 * @param stagePartitions   per-stage partition settings, forwarded to the transform step
 * @param pluginContext     used to instantiate plugins by stage name
 * @param collectors        statistics collectors keyed by stage name; stages without an entry get a no-op one
 * @param uncombinableSinks stage names that are added to the set of stages excluded from consolidation
 * @param consolidateStages whether stages of the DAG should be grouped
 * @param cacheFunctions    passed to the function cache factory
 * @throws IllegalStateException if the phase has no DAG, or a stage has no input and is not a source
 * @throws Exception if building or running the pipeline fails; the cause of a failed parallel sink task is
 *                   propagated
 */
public void runPipeline(PhaseSpec phaseSpec, String sourcePluginType, JavaSparkExecutionContext sec, Map<String, Integer> stagePartitions, PluginContext pluginContext, Map<String, StageStatisticsCollector> collectors, Set<String> uncombinableSinks, boolean consolidateStages, boolean cacheFunctions) throws Exception {
    PipelinePhase pipelinePhase = phaseSpec.getPhase();
    BasicArguments arguments = new BasicArguments(sec);
    FunctionCache.Factory functionCacheFactory = FunctionCache.Factory.newInstance(cacheFunctions);
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(arguments, sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
    Map<String, EmittedRecords> emittedRecords = new HashMap<>();
    // should never happen, but removes warning
    if (pipelinePhase.getDag() == null) {
        throw new IllegalStateException("Pipeline phase has no connections.");
    }
    Set<String> uncombinableStages = new HashSet<>(uncombinableSinks);
    for (String uncombinableType : UNCOMBINABLE_PLUGIN_TYPES) {
        pipelinePhase.getStagesOfType(uncombinableType).stream().map(StageSpec::getName).forEach(uncombinableStages::add);
    }
    CombinerDag groupedDag = new CombinerDag(pipelinePhase.getDag(), uncombinableStages);
    Map<String, Set<String>> groups = consolidateStages ? groupedDag.groupNodes() : Collections.emptyMap();
    if (!groups.isEmpty()) {
        LOG.debug("Stage consolidation is on.");
        int groupNum = 1;
        for (Set<String> group : groups.values()) {
            LOG.debug("Group{}: {}", groupNum, group);
            groupNum++;
        }
    }
    // Nodes with more than one output in the grouped DAG.
    Set<String> branchers = new HashSet<>();
    for (String stageName : groupedDag.getNodes()) {
        if (groupedDag.getNodeOutputs(stageName).size() > 1) {
            branchers.add(stageName);
        }
    }
    Set<String> shufflers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE).stream().map(StageSpec::getName).collect(Collectors.toSet());
    Collection<Runnable> sinkRunnables = new ArrayList<>();
    for (String stageName : groupedDag.getTopologicalOrder()) {
        // A consolidated group is handled as a single unit and produces one sink task.
        if (groups.containsKey(stageName)) {
            sinkRunnables.add(handleGroup(sec, phaseSpec, groups.get(stageName), groupedDag.getNodeInputs(stageName), emittedRecords, collectors));
            continue;
        }
        StageSpec stageSpec = pipelinePhase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        EmittedRecords.Builder emittedBuilder = EmittedRecords.builder();
        // don't want to do an additional filter for stages that can emit errors,
        // but aren't connected to an ErrorTransform
        // similarly, don't want to do an additional filter for alerts when the stage isn't connected to
        // an AlertPublisher
        boolean hasErrorOutput = false;
        boolean hasAlertOutput = false;
        Set<String> outputs = pipelinePhase.getStageOutputs(stageName);
        for (String output : outputs) {
            String outputPluginType = pipelinePhase.getStage(output).getPluginType();
            // noinspection ConstantConditions
            if (ErrorTransform.PLUGIN_TYPE.equals(outputPluginType)) {
                hasErrorOutput = true;
            } else if (AlertPublisher.PLUGIN_TYPE.equals(outputPluginType)) {
                hasAlertOutput = true;
            }
        }
        SparkCollection<Object> stageData = null;
        Map<String, SparkCollection<Object>> inputDataCollections = new HashMap<>();
        Set<String> stageInputs = pipelinePhase.getStageInputs(stageName);
        for (String inputStageName : stageInputs) {
            StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
            if (inputStageSpec == null) {
                // means the input to this stage is in a separate phase. For example, it is an action.
                continue;
            }
            String port = null;
            // not errors or alerts or output port records
            if (!Constants.Connector.PLUGIN_TYPE.equals(inputStageSpec.getPluginType()) && !Constants.Connector.PLUGIN_TYPE.equals(pluginType)) {
                port = inputStageSpec.getOutputPorts().get(stageName).getPort();
            }
            SparkCollection<Object> inputRecords = port == null ? emittedRecords.get(inputStageName).outputRecords : emittedRecords.get(inputStageName).outputPortRecords.get(port);
            inputDataCollections.put(inputStageName, inputRecords);
        }
        // initialize the stageRDD as the union of all input RDDs.
        if (!inputDataCollections.isEmpty()) {
            Iterator<SparkCollection<Object>> inputCollectionIter = inputDataCollections.values().iterator();
            stageData = inputCollectionIter.next();
            // don't union inputs records if we're joining or if we're processing errors
            while (!BatchJoiner.PLUGIN_TYPE.equals(pluginType) && !ErrorTransform.PLUGIN_TYPE.equals(pluginType) && inputCollectionIter.hasNext()) {
                stageData = stageData.union(inputCollectionIter.next());
            }
        }
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipelinePhase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipelinePhase.getSinks().contains(stageName);
        StageStatisticsCollector collector = collectors.get(stageName) == null ? new NoopStageStatisticsCollector() : collectors.get(stageName);
        PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
        if (stageData == null) {
            // null in the other else-if conditions
            if (sourcePluginType.equals(pluginType) || isConnectorSource) {
                SparkCollection<RecordInfo<Object>> combinedData = getSource(stageSpec, functionCacheFactory, collector);
                emittedBuilder = addEmitted(emittedBuilder, pipelinePhase, stageSpec, combinedData, groupedDag, branchers, shufflers, hasErrorOutput, hasAlertOutput);
            } else {
                throw new IllegalStateException(String.format("Stage '%s' has no input and is not a source.", stageName));
            }
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            sinkRunnables.add(stageData.createStoreTask(stageSpec, new BatchSinkFunction(pluginFunctionContext, functionCacheFactory.newCache())));
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            SparkSink<Object> sparkSink = pluginContext.newPluginInstance(stageName, macroEvaluator);
            sinkRunnables.add(stageData.createStoreTask(stageSpec, sparkSink));
        } else if (AlertPublisher.PLUGIN_TYPE.equals(pluginType)) {
            // union all the alerts coming into this stage
            SparkCollection<Alert> inputAlerts = null;
            for (String inputStage : stageInputs) {
                SparkCollection<Alert> inputErrorsFromStage = emittedRecords.get(inputStage).alertRecords;
                if (inputErrorsFromStage == null) {
                    continue;
                }
                if (inputAlerts == null) {
                    inputAlerts = inputErrorsFromStage;
                } else {
                    inputAlerts = inputAlerts.union(inputErrorsFromStage);
                }
            }
            if (inputAlerts != null) {
                inputAlerts.publishAlerts(stageSpec, collector);
            }
        } else if (ErrorTransform.PLUGIN_TYPE.equals(pluginType)) {
            // union all the errors coming into this stage
            SparkCollection<ErrorRecord<Object>> inputErrors = null;
            for (String inputStage : stageInputs) {
                SparkCollection<ErrorRecord<Object>> inputErrorsFromStage = emittedRecords.get(inputStage).errorRecords;
                if (inputErrorsFromStage == null) {
                    continue;
                }
                if (inputErrors == null) {
                    inputErrors = inputErrorsFromStage;
                } else {
                    inputErrors = inputErrors.union(inputErrorsFromStage);
                }
            }
            if (inputErrors != null) {
                SparkCollection<RecordInfo<Object>> combinedData = inputErrors.flatMap(stageSpec, new ErrorTransformFunction<Object, Object>(pluginFunctionContext, functionCacheFactory.newCache()));
                emittedBuilder = addEmitted(emittedBuilder, pipelinePhase, stageSpec, combinedData, groupedDag, branchers, shufflers, hasErrorOutput, hasAlertOutput);
            }
        } else {
            // Any other plugin type: try the relational transform first, fall back to the regular transform.
            Object plugin = pluginContext.newPluginInstance(stageName, macroEvaluator);
            Optional<EmittedRecords.Builder> declarativeBuilder = tryRelationalTransform(pipelinePhase, groupedDag, branchers, shufflers, stageName, stageSpec, emittedBuilder, hasErrorOutput, hasAlertOutput, stageData, inputDataCollections, plugin);
            if (declarativeBuilder.isPresent()) {
                emittedBuilder = declarativeBuilder.get();
            } else {
                emittedBuilder = transform(emittedBuilder, stagePartitions, pipelinePhase, functionCacheFactory, groupedDag, branchers, shufflers, stageName, stageSpec, pluginType, hasErrorOutput, hasAlertOutput, stageData, inputDataCollections, collector, pluginFunctionContext, plugin);
            }
        }
        emittedRecords.put(stageName, emittedBuilder.build());
    }
    boolean shouldWriteInParallel = Boolean.parseBoolean(sec.getRuntimeArguments().get("pipeline.spark.parallel.sinks.enabled"));
    // A fixed thread pool cannot be created with zero threads, so an empty task list always takes the
    // sequential path, where it is simply a no-op.
    if (!shouldWriteInParallel || sinkRunnables.isEmpty()) {
        for (Runnable runnable : sinkRunnables) {
            runnable.run();
        }
        return;
    }
    Collection<Future<?>> sinkFutures = new ArrayList<>(sinkRunnables.size());
    ExecutorService executorService = Executors.newFixedThreadPool(sinkRunnables.size(), new ThreadFactoryBuilder().setNameFormat("pipeline-sink-task").build());
    Throwable error = null;
    try {
        for (Runnable runnable : sinkRunnables) {
            sinkFutures.add(executorService.submit(runnable));
        }
        // Wait for the sinks in submission order; stop at the first failure or interruption.
        for (Future<?> future : sinkFutures) {
            try {
                future.get();
            } catch (ExecutionException e) {
                error = e.getCause();
                break;
            } catch (InterruptedException e) {
                // Restore the interrupt status so callers can still observe the interruption.
                Thread.currentThread().interrupt();
                break;
            }
        }
    } finally {
        // Always stop the pool, cancelling any sink tasks that are still running.
        executorService.shutdownNow();
    }
    if (error != null) {
        throw Throwables.propagate(error);
    }
}
Also used : DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ArrayList(java.util.ArrayList) PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) ExecutionException(java.util.concurrent.ExecutionException) FunctionCache(io.cdap.cdap.etl.spark.function.FunctionCache) HashSet(java.util.HashSet) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) RecordInfo(io.cdap.cdap.etl.common.RecordInfo) CombinerDag(io.cdap.cdap.etl.planner.CombinerDag) BatchSinkFunction(io.cdap.cdap.etl.spark.function.BatchSinkFunction) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) ErrorRecord(io.cdap.cdap.etl.api.ErrorRecord)

Example 15 with DefaultMacroEvaluator

use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.

In the class StreamingMultiSinkFunction, method call.

/**
 * Writes one streaming micro-batch to all sinks of a consolidated stage group.
 *
 * <p>The stages of the group are prepared in topological order, the batch is written to the sinks through a
 * {@link MultiSinkFunction}, lineage is registered for the written outputs, and finally {@code onRunFinish}
 * is invoked for every stage of the group. Failures while preparing, writing or finishing are logged rather
 * than rethrown.
 *
 * @param data      the records of the current micro-batch
 * @param batchTime the batch time; its millisecond value is used as the logical start time
 * @throws Exception if creating the stages fails
 */
@Override
public void call(JavaRDD<RecordInfo<Object>> data, Time batchTime) throws Exception {
    long logicalStartTime = batchTime.milliseconds();
    MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(sec), logicalStartTime, sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec, logicalStartTime);
    Map<String, SubmitterLifecycle<?>> stages = createStages(evaluator);
    // call prepareRun() on all the stages in the group
    // need to call it in an order that guarantees that inputs are called before outputs
    // this is because plugins can call getArguments().set() in the prepareRun() method,
    // which downstream stages should be able to read
    List<String> traversalOrder = new ArrayList<>(group.size());
    for (String stageName : phaseSpec.getPhase().getDag().getTopologicalOrder()) {
        if (group.contains(stageName)) {
            traversalOrder.add(stageName);
        }
    }
    for (String stageName : traversalOrder) {
        SubmitterLifecycle<?> plugin = stages.get(stageName);
        StageSpec stageSpec = phaseSpec.getPhase().getStage(stageName);
        try {
            prepareRun(pipelineRuntime, sinkFactory, stageSpec, plugin);
        } catch (Exception e) {
            LOG.error("Error preparing sink {} for the batch for time {}.", stageName, logicalStartTime, e);
            // NOTE(review): stages prepared earlier in this loop do not receive onRunFinish() on this
            // early return -- confirm whether that is intended.
            return;
        }
    }
    // run the actual transforms and sinks in this group
    boolean ranSuccessfully = true;
    try {
        MultiSinkFunction multiSinkFunction = new MultiSinkFunction(sec, phaseSpec, group, collectors);
        Set<String> outputNames = sinkFactory.writeCombinedRDD(data.flatMapToPair(multiSinkFunction), sec, sinkNames);
        // Register write lineage for every output that was written.
        sec.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext context) throws Exception {
                for (String outputName : outputNames) {
                    ExternalDatasets.registerLineage(sec.getAdmin(), outputName, AccessType.WRITE, null, () -> context.getDataset(outputName));
                }
            }
        });
    } catch (Exception e) {
        LOG.error("Error writing to sinks {} for the batch for time {}.", sinkNames, logicalStartTime, e);
        ranSuccessfully = false;
    }
    // run onRunFinish() for each sink
    for (String stageName : traversalOrder) {
        SubmitterLifecycle<?> plugin = stages.get(stageName);
        StageSpec stageSpec = phaseSpec.getPhase().getStage(stageName);
        try {
            onRunFinish(pipelineRuntime, sinkFactory, stageSpec, plugin, ranSuccessfully);
        } catch (Exception e) {
            // A failing onRunFinish must not prevent the remaining stages from being notified.
            LOG.warn("Unable to execute onRunFinish for sink {}", stageName, e);
        }
    }
}
Also used : SubmitterLifecycle(io.cdap.cdap.etl.api.SubmitterLifecycle) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) ArrayList(java.util.ArrayList) MultiSinkFunction(io.cdap.cdap.etl.spark.function.MultiSinkFunction) TransactionFailureException(org.apache.tephra.TransactionFailureException) SparkPipelinePluginContext(io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) SparkBatchSinkFactory(io.cdap.cdap.etl.spark.batch.SparkBatchSinkFactory) TxRunnable(io.cdap.cdap.api.TxRunnable) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) DatasetContext(io.cdap.cdap.api.data.DatasetContext)

Aggregations

DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator)40 MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator)38 BasicArguments (io.cdap.cdap.etl.common.BasicArguments)26 PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime)22 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)20 HashMap (java.util.HashMap)16 Map (java.util.Map)16 PluginContext (io.cdap.cdap.api.plugin.PluginContext)14 BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec)14 PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext)12 SparkPipelineRuntime (io.cdap.cdap.etl.spark.SparkPipelineRuntime)10 SparkPipelinePluginContext (io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext)10 MacroParserOptions (io.cdap.cdap.api.macro.MacroParserOptions)8 PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase)8 AlertPublisher (io.cdap.cdap.etl.api.AlertPublisher)6 OAuthMacroEvaluator (io.cdap.cdap.etl.common.OAuthMacroEvaluator)6 SecureStoreMacroEvaluator (io.cdap.cdap.etl.common.SecureStoreMacroEvaluator)6 TxRunnable (io.cdap.cdap.api.TxRunnable)5 DatasetContext (io.cdap.cdap.api.data.DatasetContext)5 ArrayList (java.util.ArrayList)5