Example 1 with DefaultStageMetrics

use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.

the class ETLWorker method run.

@Override
public void run() {
    final SourceState currentState = new SourceState();
    final SourceState nextState = new SourceState();
    final Map<String, List<Object>> dataToSink = new HashMap<>();
    boolean hasData = false;
    final Map<String, List<InvalidEntry>> transformIdToErrorRecords = intializeTransformIdToErrorsList();
    final WorkerContext context = getContext();
    Set<String> transformErrorsWithoutDataset = Sets.newHashSet();
    // Fetch SourceState from State Table.
    // Only required at the beginning since we persist the state if there is a change.
    Transactionals.execute(context, new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
            byte[] stateBytes = stateTable.read(stateStoreKeyBytes);
            if (stateBytes != null) {
                SourceState state = GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
                currentState.setState(state);
            }
        }
    });
    DefaultEmitter<Object> sourceEmitter = new DefaultEmitter<>();
    TrackedEmitter<Object> trackedSourceEmitter = new TrackedEmitter<>(sourceEmitter, new DefaultStageMetrics(metrics, sourceStageName), TrackedTransform.RECORDS_OUT, context.getDataTracer(sourceStageName));
    while (!stopped) {
        // Invoke poll method of the source to fetch data
        try {
            SourceState newState = source.poll(trackedSourceEmitter, new SourceState(currentState));
            if (newState != null) {
                nextState.setState(newState);
            }
        } catch (Exception e) {
            // The source threw an exception, so there is nothing to process and the state is unchanged; skip this iteration.
            LOG.warn("Exception thrown during polling of Source for data", e);
            sourceEmitter.reset();
            continue;
        }
        // For each object emitted by the source, run it through the transform executor and collect the results
        // to be persisted in the sink.
        for (Object sourceData : sourceEmitter.getEntries()) {
            try {
                TransformResponse transformResponse = transformExecutor.runOneIteration(sourceData);
                for (Map.Entry<String, Collection<Object>> transformedValues : transformResponse.getSinksResults().entrySet()) {
                    dataToSink.put(transformedValues.getKey(), new ArrayList<>());
                    Iterator emitterIterator = transformedValues.getValue().iterator();
                    while (emitterIterator.hasNext()) {
                        if (!hasData) {
                            hasData = true;
                        }
                        dataToSink.get(transformedValues.getKey()).add(emitterIterator.next());
                    }
                }
                for (Map.Entry<String, Collection<InvalidEntry<Object>>> transformErrorsEntry : transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
                    if (!transformErrorsWithoutDataset.contains(transformErrorsEntry.getKey())) {
                        if (!tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            transformErrorsWithoutDataset.add(transformErrorsEntry.getKey());
                            LOG.warn("Error records were emitted in transform {}, " + "but error dataset is not configured for this transform", transformErrorsEntry.getKey());
                        }
                        if (tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            // add the errors
                            if (!hasData && transformErrorsEntry.getValue().size() > 0) {
                                hasData = true;
                            }
                            transformIdToErrorRecords.get(transformErrorsEntry.getKey()).addAll(transformErrorsEntry.getValue());
                        }
                    }
                }
            } catch (Exception e) {
                LOG.warn("Exception thrown while processing data {}", sourceData, e);
            }
        }
        sourceEmitter.reset();
        // Start a Transaction if there is data to persist or if the Source state has changed.
        try {
            if (hasData || (!nextState.equals(currentState))) {
                getContext().execute(new TxRunnable() {

                    @Override
                    public void run(DatasetContext context) throws Exception {
                        // Invoke the sink's write method if there is any object to be written.
                        if (!dataToSink.isEmpty()) {
                            DefaultDataWriter defaultDataWriter = new DefaultDataWriter(getContext(), context);
                            for (Map.Entry<String, List<Object>> sinkEntry : dataToSink.entrySet()) {
                                sinks.get(sinkEntry.getKey()).write(sinkEntry.getValue(), defaultDataWriter);
                            }
                        }
                        for (Map.Entry<String, List<InvalidEntry>> errorRecordEntry : transformIdToErrorRecords.entrySet()) {
                            String transformId = errorRecordEntry.getKey();
                            final String datasetName = tranformIdToDatasetName.get(transformId);
                            Table errorTable = context.getDataset(datasetName);
                            long timeInMillis = System.currentTimeMillis();
                            byte[] currentTime = Bytes.toBytes(timeInMillis);
                            String transformIdentifier = appName + SEPARATOR + transformId;
                            for (InvalidEntry invalidEntry : errorRecordEntry.getValue()) {
                                // using random uuid as we want to write each record uniquely,
                                // but we are not concerned about the uuid while scanning later.
                                byte[] rowKey = Bytes.concat(currentTime, Bytes.toBytes(transformIdentifier), Bytes.toBytes(UUID.randomUUID()));
                                Put errorPut = constructErrorPut(rowKey, invalidEntry, timeInMillis);
                                errorTable.write(rowKey, errorPut);
                            }
                        }
                        // Persist nextState if it is different from currentState
                        if (!nextState.equals(currentState)) {
                            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
                            stateTable.write(stateStoreKey, GSON.toJson(nextState));
                        }
                        // After running one iteration and successfully writing to sinks and error datasets, reset the emitters.
                        transformExecutor.resetEmitter();
                    }
                });
                // Update the in-memory copy of the state only if the transaction succeeded.
                currentState.setState(nextState);
            }
        } catch (Exception e) {
            LOG.warn("Exception thrown during persisting of data", e);
        } finally {
            // Clear the collected sink data (if the transaction failed, we will poll the source again with the old state)
            hasData = false;
            dataToSink.clear();
            for (List<InvalidEntry> invalidEntryList : transformIdToErrorRecords.values()) {
                invalidEntryList.clear();
            }
        }
    }
}
Also used : DefaultEmitter(co.cask.cdap.etl.common.DefaultEmitter) HashMap(java.util.HashMap) InvalidEntry(co.cask.cdap.etl.api.InvalidEntry) TxRunnable(co.cask.cdap.api.TxRunnable) TrackedEmitter(co.cask.cdap.etl.common.TrackedEmitter) CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) Iterator(java.util.Iterator) List(java.util.List) ArrayList(java.util.ArrayList) DatasetContext(co.cask.cdap.api.data.DatasetContext) SourceState(co.cask.cdap.etl.api.realtime.SourceState) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) IOException(java.io.IOException) Put(co.cask.cdap.api.dataset.table.Put) Collection(java.util.Collection) TransformResponse(co.cask.cdap.etl.common.TransformResponse) WorkerContext(co.cask.cdap.api.worker.WorkerContext) Map(java.util.Map) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)
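
The run loop above checkpoints the source by serializing SourceState to JSON in a KeyValueTable and reading it back on startup. A minimal sketch of that round trip, assuming the GSON instance from ETLWorker is in scope; the saveState/loadState helper names are hypothetical:

// Hypothetical helpers sketching the checkpoint round trip used in run().
// Assumes GSON (a Gson instance) is in scope, as in ETLWorker.
void saveState(KeyValueTable stateTable, String stateKey, SourceState state) {
    // Serialize to JSON so any source-specific state fields survive restarts.
    stateTable.write(stateKey, GSON.toJson(state));
}

SourceState loadState(KeyValueTable stateTable, String stateKey) {
    byte[] stateBytes = stateTable.read(Bytes.toBytes(stateKey));
    // A missing row means a first run: start from an empty state.
    return stateBytes == null
        ? new SourceState()
        : GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
}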

Example 2 with DefaultStageMetrics

use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getMultiOutputTransform.

private <IN, ERROR> TrackedMultiOutputTransform<IN, ERROR> getMultiOutputTransform(StageSpec stageSpec) throws Exception {
    String stageName = stageSpec.getName();
    DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(arguments, taskContext.getLogicalStartTime(), taskContext, taskContext.getNamespace());
    SplitterTransform<IN, ERROR> splitterTransform = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
    TransformContext transformContext = createRuntimeContext(stageSpec);
    splitterTransform.initialize(transformContext);
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
    StageStatisticsCollector collector = isPipelineContainsCondition ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext) : new NoopStageStatisticsCollector();
    return new TrackedMultiOutputTransform<>(splitterTransform, stageMetrics, taskContext.getDataTracer(stageName), collector);
}
Also used : NoopStageStatisticsCollector(co.cask.cdap.etl.common.NoopStageStatisticsCollector) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) TrackedMultiOutputTransform(co.cask.cdap.etl.common.TrackedMultiOutputTransform) TransformContext(co.cask.cdap.etl.api.TransformContext) StageStatisticsCollector(co.cask.cdap.etl.common.StageStatisticsCollector) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) StageMetrics(co.cask.cdap.etl.api.StageMetrics) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)
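
DefaultStageMetrics binds the program's Metrics to a single stage name so each stage's counters stay separate. The sketch below illustrates that scoping idea only; it is not the real DefaultStageMetrics implementation, and the prefix format is an assumption:

// Illustrative only: per-stage scoping of metric names. The real
// DefaultStageMetrics lives in co.cask.cdap.etl.common; the key format
// below is an assumption.
class PerStageCounters {
    private final Metrics metrics; // co.cask.cdap.api.metrics.Metrics
    private final String stageName;

    PerStageCounters(Metrics metrics, String stageName) {
        this.metrics = metrics;
        this.stageName = stageName;
    }

    void count(String metricName, int delta) {
        // Two stages emitting "records.out" produce distinct metric keys.
        metrics.count(stageName + "." + metricName, delta);
    }
}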

Example 3 with DefaultStageMetrics

use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getTransformation.

@SuppressWarnings("unchecked")
@Override
protected TrackedTransform getTransformation(StageInfo stageInfo) throws Exception {
    DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(taskContext.getWorkflowToken(), taskContext.getRuntimeArguments(), taskContext.getLogicalStartTime(), taskContext, taskContext.getNamespace());
    String stageName = stageInfo.getName();
    String pluginType = stageInfo.getPluginType();
    StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
    if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
        BatchAggregator<?, ?, ?> batchAggregator = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
        batchAggregator.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
        } else {
            return getTrackedAggregateStep(new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
        }
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
        BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
        BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
        batchJoiner.initialize(runtimeContext);
        if (isMapPhase) {
            return getTrackedEmitKeyStep(new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
        } else {
            return getTrackedMergeStep(new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, runtimeContext.getInputSchemas().size()), stageMetrics, taskContext.getDataTracer(stageName));
        }
    }
    Transformation transformation = getInitializedTransformation(stageInfo);
    boolean isLimitingSource = taskContext.getDataTracer(stageName).isEnabled() && BatchSource.PLUGIN_TYPE.equals(pluginType) && isMapPhase;
    return new TrackedTransform(isLimitingSource ? new LimitingTransform(transformation, numberOfRecordsPreview) : transformation, stageMetrics, taskContext.getDataTracer(stageName));
}
Also used : BatchJoinerRuntimeContext(co.cask.cdap.etl.api.batch.BatchJoinerRuntimeContext) TrackedTransform(co.cask.cdap.etl.common.TrackedTransform) Transformation(co.cask.cdap.etl.api.Transformation) LimitingTransform(co.cask.cdap.etl.common.preview.LimitingTransform) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) BatchRuntimeContext(co.cask.cdap.etl.api.batch.BatchRuntimeContext) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) StageMetrics(co.cask.cdap.etl.api.StageMetrics) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)
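
Every branch of getTransformation returns the underlying Transformation wrapped for tracking. Below is a simplified sketch of the tracking idea, reusing the RECORDS_IN/RECORDS_OUT constants seen in these examples; it is illustrative, not the actual TrackedTransform source, and it assumes Emitter declares emit and emitError and that the constants are metric-name strings:

// Simplified sketch: count records.in on entry, records.out on each emit.
class CountingTransform<IN, OUT> implements Transformation<IN, OUT> {
    private final Transformation<IN, OUT> delegate;
    private final StageMetrics stageMetrics;

    CountingTransform(Transformation<IN, OUT> delegate, StageMetrics stageMetrics) {
        this.delegate = delegate;
        this.stageMetrics = stageMetrics;
    }

    @Override
    public void transform(IN input, final Emitter<OUT> emitter) throws Exception {
        stageMetrics.count(TrackedTransform.RECORDS_IN, 1);
        // Wrap the emitter so each emitted record bumps the output counter.
        delegate.transform(input, new Emitter<OUT>() {
            @Override
            public void emit(OUT value) {
                stageMetrics.count(TrackedTransform.RECORDS_OUT, 1);
                emitter.emit(value);
            }

            @Override
            public void emitError(InvalidEntry<OUT> invalidEntry) {
                emitter.emitError(invalidEntry);
            }
        });
    }
}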

Example 4 with DefaultStageMetrics

use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.

the class ETLWorker method initializeTransforms.

private void initializeTransforms(WorkerContext context, Map<String, TransformDetail> transformDetailMap, PipelinePhase pipeline) throws Exception {
    Set<StageInfo> transformInfos = pipeline.getStagesOfType(Transform.PLUGIN_TYPE);
    Preconditions.checkArgument(transformInfos != null);
    tranformIdToDatasetName = new HashMap<>(transformInfos.size());
    for (StageInfo transformInfo : transformInfos) {
        String transformName = transformInfo.getName();
        try {
            Transform<?, ?> transform = context.newPluginInstance(transformName);
            transform = new WrappedTransform<>(transform, Caller.DEFAULT);
            WorkerRealtimeContext transformContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), transformInfo);
            LOG.debug("Transform Class : {}", transform.getClass().getName());
            transform.initialize(transformContext);
            StageMetrics stageMetrics = new DefaultStageMetrics(metrics, transformName);
            transformDetailMap.put(transformName, new TransformDetail(new TrackedTransform<>(transform, stageMetrics, context.getDataTracer(transformName)), pipeline.getStageOutputs(transformName)));
            if (transformInfo.getErrorDatasetName() != null) {
                tranformIdToDatasetName.put(transformName, transformInfo.getErrorDatasetName());
            }
        } catch (InstantiationException e) {
            LOG.error("Unable to instantiate Transform", e);
            Throwables.propagate(e);
        }
    }
}
Also used : TrackedTransform(co.cask.cdap.etl.common.TrackedTransform) StageInfo(co.cask.cdap.etl.planner.StageInfo) TxLookupProvider(co.cask.cdap.etl.common.TxLookupProvider) TransformDetail(co.cask.cdap.etl.common.TransformDetail) StageMetrics(co.cask.cdap.etl.api.StageMetrics) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)
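
Because every transform receives its own DefaultStageMetrics, any metric a plugin emits through its context is scoped per stage. A hypothetical transform using that; FilterNullsTransform and the records.null metric are made up for illustration, and it assumes the stage-scoped StageMetrics is reachable via getContext().getMetrics():

// Hypothetical plugin: drops null records and counts them per stage.
// Extends co.cask.cdap.etl.api.Transform; StructuredRecord is from
// co.cask.cdap.api.data.format.
public class FilterNullsTransform extends Transform<StructuredRecord, StructuredRecord> {
    @Override
    public void transform(StructuredRecord input, Emitter<StructuredRecord> emitter) throws Exception {
        if (input == null) {
            // Scoped by stage name, so two instances of this transform report separately.
            getContext().getMetrics().count("records.null", 1);
            return;
        }
        emitter.emit(input);
    }
}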

Example 5 with DefaultStageMetrics

use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.

the class ETLWorker method initializeSinks.

@SuppressWarnings("unchecked")
private void initializeSinks(WorkerContext context, Map<String, TransformDetail> transformationMap, PipelinePhase pipeline) throws Exception {
    Set<StageInfo> sinkInfos = pipeline.getStagesOfType(RealtimeSink.PLUGIN_TYPE);
    sinks = new HashMap<>(sinkInfos.size());
    for (StageInfo sinkInfo : sinkInfos) {
        String sinkName = sinkInfo.getName();
        RealtimeSink sink = context.newPluginInstance(sinkName);
        sink = new LoggedRealtimeSink(sinkName, sink);
        WorkerRealtimeContext sinkContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), sinkInfo);
        LOG.debug("Sink Class : {}", sink.getClass().getName());
        sink.initialize(sinkContext);
        sink = new TrackedRealtimeSink(sink, new DefaultStageMetrics(metrics, sinkName));
        Transformation identityTransformation = new Transformation() {

            @Override
            public void transform(Object input, Emitter emitter) throws Exception {
                emitter.emit(input);
            }
        };
        // we use an identity transformation to simplify executing the transformation in the pipeline (similar to
        // ETLMapReduce). Since metrics should be emitted when records reach the sink stage rather than during this
        // pass-through step, only RECORDS_IN is tracked and the output metric name is left null.
        TrackedTransform trackedTransform = new TrackedTransform(identityTransformation, new DefaultStageMetrics(metrics, sinkName), TrackedTransform.RECORDS_IN, null, context.getDataTracer(sinkName));
        transformationMap.put(sinkInfo.getName(), new TransformDetail(trackedTransform, new HashSet<String>()));
        sinks.put(sinkInfo.getName(), sink);
    }
}
Also used : TrackedTransform(co.cask.cdap.etl.common.TrackedTransform) Transformation(co.cask.cdap.etl.api.Transformation) Emitter(co.cask.cdap.etl.api.Emitter) DefaultEmitter(co.cask.cdap.etl.common.DefaultEmitter) TrackedEmitter(co.cask.cdap.etl.common.TrackedEmitter) StageInfo(co.cask.cdap.etl.planner.StageInfo) TxLookupProvider(co.cask.cdap.etl.common.TxLookupProvider) TransformDetail(co.cask.cdap.etl.common.TransformDetail) RealtimeSink(co.cask.cdap.etl.api.realtime.RealtimeSink) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics) HashSet(java.util.HashSet)
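
A small sketch of driving the identity transform above with the DefaultEmitter seen in Example 1; probeIdentity is a hypothetical helper, and the raw TrackedTransform type mirrors its usage in initializeSinks:

// Hypothetical probe: run the identity transform once through its tracked wrapper;
// only the stage's records.in counter should be incremented.
void probeIdentity(TrackedTransform trackedTransform) throws Exception {
    DefaultEmitter<Object> testEmitter = new DefaultEmitter<>();
    trackedTransform.transform("some-record", testEmitter);
    for (Object out : testEmitter.getEntries()) {
        // Identity transform: the input record comes back unchanged.
        System.out.println(out);
    }
    testEmitter.reset();
}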

Aggregations

DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics): 10 usages
StageMetrics (co.cask.cdap.etl.api.StageMetrics): 8 usages
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 5 usages
TrackedTransform (co.cask.cdap.etl.common.TrackedTransform): 4 usages
Alert (co.cask.cdap.etl.api.Alert): 3 usages
AlertPublisher (co.cask.cdap.etl.api.AlertPublisher): 3 usages
AlertPublisherContext (co.cask.cdap.etl.api.AlertPublisherContext): 3 usages
Transformation (co.cask.cdap.etl.api.Transformation): 3 usages
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 3 usages
DefaultAlertPublisherContext (co.cask.cdap.etl.common.DefaultAlertPublisherContext): 3 usages
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime): 3 usages
TrackedIterator (co.cask.cdap.etl.common.TrackedIterator): 3 usages
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 2 usages
BatchJoinerRuntimeContext (co.cask.cdap.etl.api.batch.BatchJoinerRuntimeContext): 2 usages
BatchRuntimeContext (co.cask.cdap.etl.api.batch.BatchRuntimeContext): 2 usages
PostAction (co.cask.cdap.etl.api.batch.PostAction): 2 usages
DefaultEmitter (co.cask.cdap.etl.common.DefaultEmitter): 2 usages
NoopStageStatisticsCollector (co.cask.cdap.etl.common.NoopStageStatisticsCollector): 2 usages
TransformDetail (co.cask.cdap.etl.common.TransformDetail): 2 usages
TxLookupProvider (co.cask.cdap.etl.common.TxLookupProvider): 2 usages