Search in sources :

Example 1 with BatchPhaseSpec

use of io.cdap.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class SmartWorkflow method getPhaseSpec.

private BatchPhaseSpec getPhaseSpec(String programName, PipelinePhase phase) {
    Map<String, String> phaseConnectorDatasets = new HashMap<>();
    // if this phase uses connectors, add the local dataset for that connector if we haven't already
    for (StageSpec connectorInfo : phase.getStagesOfType(Constants.Connector.PLUGIN_TYPE)) {
        String connectorName = connectorInfo.getName();
        if (!connectorDatasets.containsKey(connectorName)) {
            String datasetName = "conn-" + connectorNum++;
            connectorDatasets.put(connectorName, datasetName);
            phaseConnectorDatasets.put(connectorName, datasetName);
            // add the local dataset
            ConnectorSource connectorSource = new MultiConnectorSource(datasetName, null);
            connectorSource.configure(getConfigurer());
        } else {
            phaseConnectorDatasets.put(connectorName, connectorDatasets.get(connectorName));
        }
    }
    // published.
    for (StageSpec alertPublisherInfo : phase.getStagesOfType(AlertPublisher.PLUGIN_TYPE)) {
        String stageName = alertPublisherInfo.getName();
        if (!connectorDatasets.containsKey(stageName)) {
            String datasetName = "alerts-" + publisherNum++;
            connectorDatasets.put(alertPublisherInfo.getName(), datasetName);
            phaseConnectorDatasets.put(alertPublisherInfo.getName(), datasetName);
            AlertPublisherSink alertPublisherSink = new AlertPublisherSink(datasetName, null);
            alertPublisherSink.configure(getConfigurer());
        } else {
            phaseConnectorDatasets.put(stageName, connectorDatasets.get(stageName));
        }
    }
    return new BatchPhaseSpec(programName, phase, spec.getResources(), spec.getDriverResources(), spec.getClientResources(), spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled(), phaseConnectorDatasets, spec.getNumOfRecordsPreview(), spec.getProperties(), !plan.getConditionPhaseBranches().isEmpty(), spec.getSqlEngineStageSpec());
}
Also used : MultiConnectorSource(io.cdap.cdap.etl.batch.connector.MultiConnectorSource) ConnectorSource(io.cdap.cdap.etl.batch.connector.ConnectorSource) MultiConnectorSource(io.cdap.cdap.etl.batch.connector.MultiConnectorSource) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) AlertPublisherSink(io.cdap.cdap.etl.batch.connector.AlertPublisherSink)

Example 2 with BatchPhaseSpec

use of io.cdap.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class BatchSparkPipelineDriver method run.

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
        // Close SQL Engine Adapter if neeeded,
        if (sqlEngineAdapter != null) {
            sqlEngineAdapter.onRunFinish(isSuccessful);
            sqlEngineAdapter.close();
        }
    }
}
Also used : Path(java.nio.file.Path) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) HashMap(java.util.HashMap) SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BufferedReader(java.io.BufferedReader) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)

Example 3 with BatchPhaseSpec

use of io.cdap.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class ETLSpark method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.speculation", "false");
    // turn off auto-broadcast by default until we better understand the implications and can set this to a
    // value that we are confident is safe.
    sparkConf.set("spark.sql.autoBroadcastJoinThreshold", "-1");
    sparkConf.set("spark.maxRemoteBlockSizeFetchToMem", String.valueOf(Integer.MAX_VALUE - 512));
    sparkConf.set("spark.network.timeout", "600s");
    // Disable yarn app retries since spark already performs retries at a task level.
    sparkConf.set("spark.yarn.maxAppAttempts", "1");
    // to make sure fields that are the same but different casing are treated as different fields in auto-joins
    // see CDAP-17024
    sparkConf.set("spark.sql.caseSensitive", "true");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    SparkPreparer preparer = new SparkPreparer(context, context.getMetrics(), evaluator, pipelineRuntime);
    List<Finisher> finishers = preparer.prepare(phaseSpec);
    finisher = new CompositeFinisher(finishers);
}
Also used : PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) SparkClientContext(io.cdap.cdap.api.spark.SparkClientContext) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) Finisher(io.cdap.cdap.etl.common.submit.Finisher) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) SparkConf(org.apache.spark.SparkConf) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)

Example 4 with BatchPhaseSpec

use of io.cdap.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class PipelineAction method run.

@Override
public void run() throws Exception {
    CustomActionContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
    Action action = pluginContext.newPluginInstance(stageSpec.getName(), new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace()));
    ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
    if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
        action.run(actionContext);
    }
    WorkflowToken token = context.getWorkflowToken();
    if (token == null) {
        throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
        token.put(entry.getKey(), entry.getValue());
    }
}
Also used : Action(io.cdap.cdap.etl.api.action.Action) AbstractCustomAction(io.cdap.cdap.api.customaction.AbstractCustomAction) CustomAction(io.cdap.cdap.api.customaction.CustomAction) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) ActionContext(io.cdap.cdap.etl.api.action.ActionContext) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)

Example 5 with BatchPhaseSpec

use of io.cdap.cdap.etl.batch.BatchPhaseSpec in project cdap by caskdata.

the class ETLMapReduce method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    // instantiate plugins and call their prepare methods
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
    MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime, connectorDatasets);
    List<Finisher> finishers = preparer.prepare(phaseSpec, job);
    finisher = new CompositeFinisher(finishers);
}
Also used : PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) Configuration(org.apache.hadoop.conf.Configuration) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) MapReduceContext(io.cdap.cdap.api.mapreduce.MapReduceContext) CompositeFinisher(io.cdap.cdap.etl.common.submit.CompositeFinisher) Finisher(io.cdap.cdap.etl.common.submit.Finisher) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) Job(org.apache.hadoop.mapreduce.Job) HashMap(java.util.HashMap) Map(java.util.Map) TransactionPolicy(io.cdap.cdap.api.annotation.TransactionPolicy)

Aggregations

BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec)9 DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator)7 HashMap (java.util.HashMap)7 MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator)6 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)6 Map (java.util.Map)5 BasicArguments (io.cdap.cdap.etl.common.BasicArguments)4 PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime)4 PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext)4 PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase)3 TransactionPolicy (io.cdap.cdap.api.annotation.TransactionPolicy)2 PluginContext (io.cdap.cdap.api.plugin.PluginContext)2 SparkClientContext (io.cdap.cdap.api.spark.SparkClientContext)2 WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken)2 CompositeFinisher (io.cdap.cdap.etl.common.submit.CompositeFinisher)2 Finisher (io.cdap.cdap.etl.common.submit.Finisher)2 SparkConf (org.apache.spark.SparkConf)2 AbstractCustomAction (io.cdap.cdap.api.customaction.AbstractCustomAction)1 CustomAction (io.cdap.cdap.api.customaction.CustomAction)1 CustomActionContext (io.cdap.cdap.api.customaction.CustomActionContext)1