Search in sources :

Example 1 with StageStatisticsCollector

use of io.cdap.cdap.etl.common.StageStatisticsCollector in project cdap by caskdata.

the class TransformExecutorFactory method getTransformation.

@SuppressWarnings("unchecked")
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    String pluginType = stageSpec.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    StageStatisticsCollector collector = collectStageStatistics ? getStatisticsCollector(stageName) : NoopStageStatisticsCollector.INSTANCE;
    Transformation transformation = getInitializedTransformation(stageSpec);
    // we emit metrics for records into alert publishers when the actual alerts are published,
    // not when we write the alerts to the temporary dataset
    String recordsInMetric = AlertPublisher.PLUGIN_TYPE.equals(pluginType) ? null : Constants.Metrics.RECORDS_IN;
    return new TrackedTransform<>(transformation, stageMetrics, recordsInMetric, Constants.Metrics.RECORDS_OUT, getDataTracer(stageName), collector);
}
Also used : TrackedTransform(io.cdap.cdap.etl.common.TrackedTransform) Transformation(io.cdap.cdap.etl.api.Transformation) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics)

Example 2 with StageStatisticsCollector

use of io.cdap.cdap.etl.common.StageStatisticsCollector in project cdap by caskdata.

the class BatchSparkPipelineDriver method run.

@Override
public void run(DatasetContext context) throws Exception {
    BatchPhaseSpec phaseSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), BatchPhaseSpec.class);
    Path configFile = sec.getLocalizationContext().getLocalFile("HydratorSpark.config").toPath();
    try (BufferedReader reader = Files.newBufferedReader(configFile, StandardCharsets.UTF_8)) {
        String object = reader.readLine();
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = GSON.fromJson(object, SparkBatchSourceSinkFactoryInfo.class);
        sourceFactory = sourceSinkInfo.getSparkBatchSourceFactory();
        sinkFactory = sourceSinkInfo.getSparkBatchSinkFactory();
        stagePartitions = sourceSinkInfo.getStagePartitions();
    }
    datasetContext = context;
    PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    Map<String, StageStatisticsCollector> collectors = new HashMap<>();
    if (phaseSpec.pipelineContainsCondition()) {
        Iterator<StageSpec> iterator = phaseSpec.getPhase().iterator();
        while (iterator.hasNext()) {
            StageSpec spec = iterator.next();
            collectors.put(spec.getName(), new SparkStageStatisticsCollector(jsc));
        }
    }
    boolean isSuccessful = true;
    try {
        PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(pluginContext, sec.getMetrics(), phaseSpec, new SingleConnectorFactory());
        boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
        boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
        boolean isPreviewEnabled = phaseSpec.getPhase().size() == 0 || sec.getDataTracer(phaseSpec.getPhase().iterator().next().getName()).isEnabled();
        // Initialize SQL engine instance if needed.
        if (!isPreviewEnabled && phaseSpec.getSQLEngineStageSpec() != null) {
            String sqlEngineStage = SQLEngineUtils.buildStageName(phaseSpec.getSQLEngineStageSpec().getPlugin().getName());
            // Instantiate SQL engine and prepare run.
            try {
                MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
                Object instance = pluginInstantiator.newPluginInstance(sqlEngineStage, macroEvaluator);
                sqlEngineAdapter = new BatchSQLEngineAdapter((SQLEngine<?, ?, ?, ?>) instance, sec, jsc, collectors);
                sqlEngineAdapter.prepareRun();
            } catch (InstantiationException ie) {
                LOG.error("Could not create plugin instance for SQLEngine class", ie);
            } finally {
                if (sqlEngineAdapter == null) {
                    LOG.warn("Could not instantiate SQLEngine instance for Transformation Pushdown");
                }
            }
        }
        runPipeline(phaseSpec, BatchSource.PLUGIN_TYPE, sec, stagePartitions, pluginInstantiator, collectors, sinkFactory.getUncombinableSinks(), shouldConsolidateStages, shouldCacheFunctions);
    } catch (Throwable t) {
        // Mark this execution as not successful.
        isSuccessful = false;
        // Rethrow
        throw t;
    } finally {
        updateWorkflowToken(sec.getWorkflowToken(), collectors);
        // Close SQL Engine Adapter if neeeded,
        if (sqlEngineAdapter != null) {
            sqlEngineAdapter.onRunFinish(isSuccessful);
            sqlEngineAdapter.close();
        }
    }
}
Also used : Path(java.nio.file.Path) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) HashMap(java.util.HashMap) SingleConnectorFactory(io.cdap.cdap.etl.batch.connector.SingleConnectorFactory) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SQLEngine(io.cdap.cdap.etl.api.engine.sql.SQLEngine) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) BufferedReader(java.io.BufferedReader) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) PipelinePluginInstantiator(io.cdap.cdap.etl.batch.PipelinePluginInstantiator) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)

Example 3 with StageStatisticsCollector

use of io.cdap.cdap.etl.common.StageStatisticsCollector in project cdap by caskdata.

the class BatchSparkPipelineDriver method updateWorkflowToken.

private void updateWorkflowToken(WorkflowToken token, Map<String, StageStatisticsCollector> collectors) {
    for (Map.Entry<String, StageStatisticsCollector> entry : collectors.entrySet()) {
        SparkStageStatisticsCollector collector = (SparkStageStatisticsCollector) entry.getValue();
        String keyPrefix = Constants.StageStatistics.PREFIX + "." + entry.getKey() + ".";
        String inputRecordKey = keyPrefix + Constants.StageStatistics.INPUT_RECORDS;
        token.put(inputRecordKey, String.valueOf(collector.getInputRecordCount()));
        String outputRecordKey = keyPrefix + Constants.StageStatistics.OUTPUT_RECORDS;
        token.put(outputRecordKey, String.valueOf(collector.getOutputRecordCount()));
        String errorRecordKey = keyPrefix + Constants.StageStatistics.ERROR_RECORDS;
        token.put(errorRecordKey, String.valueOf(collector.getErrorRecordCount()));
    }
}
Also used : SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) SparkStageStatisticsCollector(io.cdap.cdap.etl.spark.SparkStageStatisticsCollector) Map(java.util.Map) HashMap(java.util.HashMap)

Example 4 with StageStatisticsCollector

use of io.cdap.cdap.etl.common.StageStatisticsCollector in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getTransformation.

@SuppressWarnings("unchecked")
@Override
protected <IN, OUT> TrackedTransform<IN, OUT> getTransformation(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    String pluginType = stageSpec.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
    StageStatisticsCollector collector = collectStageStatistics ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext) : new NoopStageStatisticsCollector();
    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchAggregator<?, ?, ?> batchAggregator;
        if (plugin instanceof BatchReducibleAggregator) {
            BatchReducibleAggregator<?, ?, ?, ?> reducibleAggregator = (BatchReducibleAggregator<?, ?, ?, ?>) plugin;
            batchAggregator = new AggregatorBridge<>(reducibleAggregator);
        } else {
            batchAggregator = (BatchAggregator<?, ?, ?>) plugin;
        }
        BatchRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
        batchAggregator.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        } else {
            return getTrackedAggregateStep(new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, getDataTracer(stageName), collector);
        }
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        Object plugin = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchJoiner<?, ?, ?> batchJoiner;
        Set<String> filterNullKeyStages = new HashSet<>();
        if (plugin instanceof BatchAutoJoiner) {
            BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
            FailureCollector failureCollector = new LoggingFailureCollector(stageName, stageSpec.getInputSchemas());
            DefaultAutoJoinerContext context = DefaultAutoJoinerContext.from(stageSpec.getInputSchemas(), failureCollector);
            // definition will be non-null due to validate by PipelinePhasePreparer at the start of the run
            JoinDefinition joinDefinition = autoJoiner.define(context);
            JoinCondition condition = joinDefinition.getCondition();
            // should never happen as it's checked at deployment time, but add this to be safe.
            if (condition.getOp() != JoinCondition.Op.KEY_EQUALITY) {
                failureCollector.addFailure(String.format("Join stage '%s' uses a %s condition, which is not supported with the MapReduce engine.", stageName, condition.getOp()), "Switch to a different execution engine.");
            }
            failureCollector.getOrThrowException();
            batchJoiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
            // this is the same as filtering out records that have a null key if they are from an optional stage
            if (condition.getOp() == JoinCondition.Op.KEY_EQUALITY && !((JoinCondition.OnKeys) condition).isNullSafe()) {
                filterNullKeyStages = joinDefinition.getStages().stream().filter(s -> !s.isRequired()).map(JoinStage::getStageName).collect(Collectors.toSet());
            }
        } else {
            batchJoiner = (BatchJoiner<?, ?, ?>) plugin;
        }
        BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageSpec);
        batchJoiner.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, filterNullKeyStages), stageMetrics, getDataTracer(stageName), collector);
        } else {
            return getTrackedMergeStep(new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, runtimeContext.getInputSchemas().size()), stageMetrics, getDataTracer(stageName), collector);
        }
    }
    return super.getTransformation(stageSpec);
}
Also used : LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) Set(java.util.Set) HashSet(java.util.HashSet) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) BatchRuntimeContext(io.cdap.cdap.etl.api.batch.BatchRuntimeContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) BatchReducibleAggregator(io.cdap.cdap.etl.api.batch.BatchReducibleAggregator) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge) BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) JoinCondition(io.cdap.cdap.etl.api.join.JoinCondition) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector)

Example 5 with StageStatisticsCollector

use of io.cdap.cdap.etl.common.StageStatisticsCollector in project cdap by caskdata.

the class BatchSQLEngineAdapter method joinInternal.

/**
 * Join implementation. This method has blocking calls and should be executed in a separate thread.
 *
 * @param joinRequest the Join Request
 * @return
 * @throws SQLEngineException if any of the preceding jobs fails.
 */
private SQLDataset joinInternal(SQLJoinRequest joinRequest) throws SQLEngineException {
    String datasetName = joinRequest.getDatasetName();
    DefaultStageMetrics stageMetrics = new DefaultStageMetrics(metrics, datasetName);
    StageStatisticsCollector statisticsCollector = statsCollectors.get(datasetName);
    // Count input metrics for each of the preceding stages.
    for (SQLDataset inputDataset : joinRequest.getInputDatasets()) {
        countRecordsIn(inputDataset, statisticsCollector, stageMetrics);
    }
    // Execute Join job.
    SQLDataset joinDataset = (SQLDataset) sqlEngine.join(joinRequest);
    // Count output rows and complete future.
    countRecordsOut(joinDataset, statisticsCollector, stageMetrics);
    return joinDataset;
}
Also used : SQLDataset(io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset) StageStatisticsCollector(io.cdap.cdap.etl.common.StageStatisticsCollector) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics)

Aggregations

StageStatisticsCollector (io.cdap.cdap.etl.common.StageStatisticsCollector)9 DefaultStageMetrics (io.cdap.cdap.etl.common.DefaultStageMetrics)6 StageMetrics (io.cdap.cdap.etl.api.StageMetrics)4 NoopStageStatisticsCollector (io.cdap.cdap.etl.common.NoopStageStatisticsCollector)4 HashMap (java.util.HashMap)4 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)3 HashSet (java.util.HashSet)3 Set (java.util.Set)3 MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator)2 SQLEngine (io.cdap.cdap.etl.api.engine.sql.SQLEngine)2 SQLDataset (io.cdap.cdap.etl.api.engine.sql.dataset.SQLDataset)2 SQLWriteRequest (io.cdap.cdap.etl.api.engine.sql.request.SQLWriteRequest)2 SQLWriteResult (io.cdap.cdap.etl.api.engine.sql.request.SQLWriteResult)2 SQLEngineWriteJobKey (io.cdap.cdap.etl.engine.SQLEngineWriteJobKey)2 SparkStageStatisticsCollector (io.cdap.cdap.etl.spark.SparkStageStatisticsCollector)2 ArrayList (java.util.ArrayList)2 ExecutorService (java.util.concurrent.ExecutorService)2 Objects (com.google.common.base.Objects)1 ImmutableSet (com.google.common.collect.ImmutableSet)1 ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder)1