use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.
the class ETLWorker method run.
@Override
public void run() {
final SourceState currentState = new SourceState();
final SourceState nextState = new SourceState();
final Map<String, List<Object>> dataToSink = new HashMap<>();
boolean hasData = false;
final Map<String, List<InvalidEntry>> transformIdToErrorRecords = intializeTransformIdToErrorsList();
final WorkerContext context = getContext();
Set<String> transformErrorsWithoutDataset = Sets.newHashSet();
// Fetch SourceState from State Table.
// Only required at the beginning since we persist the state if there is a change.
Transactionals.execute(context, new TxRunnable() {
@Override
public void run(DatasetContext context) throws Exception {
KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
byte[] stateBytes = stateTable.read(stateStoreKeyBytes);
if (stateBytes != null) {
SourceState state = GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
currentState.setState(state);
}
}
});
DefaultEmitter<Object> sourceEmitter = new DefaultEmitter<>();
TrackedEmitter<Object> trackedSourceEmitter = new TrackedEmitter<>(sourceEmitter, new DefaultStageMetrics(metrics, sourceStageName), TrackedTransform.RECORDS_OUT, context.getDataTracer(sourceStageName));
while (!stopped) {
// Invoke poll method of the source to fetch data
try {
SourceState newState = source.poll(trackedSourceEmitter, new SourceState(currentState));
if (newState != null) {
nextState.setState(newState);
}
} catch (Exception e) {
// Continue since the source threw an exception. No point in processing records and state is not changed.
LOG.warn("Exception thrown during polling of Source for data", e);
sourceEmitter.reset();
continue;
}
// Run the transform executor on each record emitted by the source and collect the data to be persisted in the sink.
for (Object sourceData : sourceEmitter.getEntries()) {
try {
TransformResponse transformResponse = transformExecutor.runOneIteration(sourceData);
for (Map.Entry<String, Collection<Object>> transformedValues : transformResponse.getSinksResults().entrySet()) {
dataToSink.put(transformedValues.getKey(), new ArrayList<>());
Iterator emitterIterator = transformedValues.getValue().iterator();
while (emitterIterator.hasNext()) {
if (!hasData) {
hasData = true;
}
dataToSink.get(transformedValues.getKey()).add(emitterIterator.next());
}
}
for (Map.Entry<String, Collection<InvalidEntry<Object>>> transformErrorsEntry : transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
if (!transformErrorsWithoutDataset.contains(transformErrorsEntry.getKey())) {
if (!tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
transformErrorsWithoutDataset.add(transformErrorsEntry.getKey());
LOG.warn("Error records were emitted in transform {}, " + "but error dataset is not configured for this transform", transformErrorsEntry.getKey());
}
if (tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
// add the errors
if (!hasData && transformErrorsEntry.getValue().size() > 0) {
hasData = true;
}
transformIdToErrorRecords.get(transformErrorsEntry.getKey()).addAll(transformErrorsEntry.getValue());
}
}
}
} catch (Exception e) {
LOG.warn("Exception thrown while processing data {}", sourceData, e);
}
}
sourceEmitter.reset();
// Start a Transaction if there is data to persist or if the Source state has changed.
try {
if (hasData || (!nextState.equals(currentState))) {
getContext().execute(new TxRunnable() {
@Override
public void run(DatasetContext context) throws Exception {
// Invoke the sink's write method if there is any object to be written.
if (!dataToSink.isEmpty()) {
DefaultDataWriter defaultDataWriter = new DefaultDataWriter(getContext(), context);
for (Map.Entry<String, List<Object>> sinkEntry : dataToSink.entrySet()) {
sinks.get(sinkEntry.getKey()).write(sinkEntry.getValue(), defaultDataWriter);
}
}
for (Map.Entry<String, List<InvalidEntry>> errorRecordEntry : transformIdToErrorRecords.entrySet()) {
String transformId = errorRecordEntry.getKey();
final String datasetName = tranformIdToDatasetName.get(transformId);
Table errorTable = context.getDataset(datasetName);
long timeInMillis = System.currentTimeMillis();
byte[] currentTime = Bytes.toBytes(timeInMillis);
String transformIdentifier = appName + SEPARATOR + transformId;
for (InvalidEntry invalidEntry : errorRecordEntry.getValue()) {
// using random uuid as we want to write each record uniquely,
// but we are not concerned about the uuid while scanning later.
byte[] rowKey = Bytes.concat(currentTime, Bytes.toBytes(transformIdentifier), Bytes.toBytes(UUID.randomUUID()));
Put errorPut = constructErrorPut(rowKey, invalidEntry, timeInMillis);
errorTable.write(rowKey, errorPut);
}
}
// Persist nextState if it is different from currentState
if (!nextState.equals(currentState)) {
KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
stateTable.write(stateStoreKey, GSON.toJson(nextState));
}
// after running one iteration and successfully writing to sinks and error datasets, reset the emitters.
transformExecutor.resetEmitter();
}
});
// Update the in-memory copy of the state only if the transaction succeeded.
currentState.setState(nextState);
}
} catch (Exception e) {
LOG.warn("Exception thrown during persisting of data", e);
} finally {
// Clear the collected sink data and error records (if the transaction failed, the source will be polled again with the old state).
hasData = false;
dataToSink.clear();
for (List<InvalidEntry> invalidEntryList : transformIdToErrorRecords.values()) {
invalidEntryList.clear();
}
}
}
}
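The DefaultStageMetrics usage to note in run() is the TrackedEmitter wrapped around the source emitter: every record the source emits is presumably counted against the source stage before it reaches the transforms. Below is a minimal sketch of that wiring; it reuses metrics, sourceStageName and context from the worker above, and the sample record and loop body are placeholders (imports omitted, as in the snippets here).
StageMetrics sourceMetrics = new DefaultStageMetrics(metrics, sourceStageName);
DefaultEmitter<Object> rawEmitter = new DefaultEmitter<>();
TrackedEmitter<Object> trackedEmitter = new TrackedEmitter<>(
    rawEmitter,                      // records are buffered here for draining
    sourceMetrics,                   // per-stage counters via DefaultStageMetrics
    TrackedTransform.RECORDS_OUT,    // counter bumped on each emit()
    context.getDataTracer(sourceStageName));
// The source writes into trackedEmitter during poll(); each emit() updates the
// source stage's records-out count while the record stays in rawEmitter.
trackedEmitter.emit("sample record");
for (Object record : rawEmitter.getEntries()) {
    // hand each record to the transform executor, as run() does
}
rawEmitter.reset();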
use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.
the class MapReduceTransformExecutorFactory method getMultiOutputTransform.
private <IN, ERROR> TrackedMultiOutputTransform<IN, ERROR> getMultiOutputTransform(StageSpec stageSpec) throws Exception {
String stageName = stageSpec.getName();
DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(arguments, taskContext.getLogicalStartTime(), taskContext, taskContext.getNamespace());
SplitterTransform<IN, ERROR> splitterTransform = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
TransformContext transformContext = createRuntimeContext(stageSpec);
splitterTransform.initialize(transformContext);
StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
TaskAttemptContext taskAttemptContext = (TaskAttemptContext) taskContext.getHadoopContext();
StageStatisticsCollector collector = isPipelineContainsCondition ? new MapReduceStageStatisticsCollector(stageName, taskAttemptContext) : new NoopStageStatisticsCollector();
return new TrackedMultiOutputTransform<>(splitterTransform, stageMetrics, taskContext.getDataTracer(stageName), collector);
}
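DefaultStageMetrics itself just pairs the task-level Metrics object with a stage name, so counters recorded by wrappers such as TrackedMultiOutputTransform are attributed to the correct stage. The sketch below only illustrates that API shape: the no-op Metrics stub, the "splitter-1" stage name and the metric names are invented, and the Metrics interface is assumed to expose just count and gauge.
Metrics noopMetrics = new Metrics() {
    @Override
    public void count(String metricName, int delta) {
        // discard; a real implementation reports to the CDAP metrics system
    }
    @Override
    public void gauge(String metricName, long value) {
        // discard
    }
};
StageMetrics stageMetrics = new DefaultStageMetrics(noopMetrics, "splitter-1");
stageMetrics.count("records.out", 1);    // attributed to the "splitter-1" stage
stageMetrics.gauge("buffer.size", 42L);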
use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.
the class MapReduceTransformExecutorFactory method getTransformation.
@SuppressWarnings("unchecked")
@Override
protected TrackedTransform getTransformation(StageInfo stageInfo) throws Exception {
DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(taskContext.getWorkflowToken(), taskContext.getRuntimeArguments(), taskContext.getLogicalStartTime(), taskContext, taskContext.getNamespace());
String stageName = stageInfo.getName();
String pluginType = stageInfo.getPluginType();
StageMetrics stageMetrics = new DefaultStageMetrics(metrics, stageName);
if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
BatchAggregator<?, ?, ?> batchAggregator = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
BatchRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
batchAggregator.initialize(runtimeContext);
if (isMapPhase) {
return getTrackedEmitKeyStep(new MapperAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
} else {
return getTrackedAggregateStep(new ReducerAggregatorTransformation(batchAggregator, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
}
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, macroEvaluator);
BatchJoinerRuntimeContext runtimeContext = createRuntimeContext(stageInfo);
batchJoiner.initialize(runtimeContext);
if (isMapPhase) {
return getTrackedEmitKeyStep(new MapperJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName), stageMetrics, taskContext.getDataTracer(stageName));
} else {
return getTrackedMergeStep(new ReducerJoinerTransformation(batchJoiner, mapOutputKeyClassName, mapOutputValClassName, runtimeContext.getInputSchemas().size()), stageMetrics, taskContext.getDataTracer(stageName));
}
}
Transformation transformation = getInitializedTransformation(stageInfo);
boolean isLimitingSource = taskContext.getDataTracer(stageName).isEnabled() && BatchSource.PLUGIN_TYPE.equals(pluginType) && isMapPhase;
return new TrackedTransform(isLimitingSource ? new LimitingTransform(transformation, numberOfRecordsPreview) : transformation, stageMetrics, taskContext.getDataTracer(stageName));
}
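In the fall-through case at the end of this method, a plain Transformation is wrapped in TrackedTransform together with the stage's DefaultStageMetrics, so record counting and preview tracing are handled by the wrapper. Here is a small sketch of that pattern with a trivial upper-casing transform; the "to-upper" stage name and the reuse of metrics and taskContext from the factory are assumptions for illustration.
Transformation<String, String> toUpper = new Transformation<String, String>() {
    @Override
    public void transform(String input, Emitter<String> emitter) throws Exception {
        emitter.emit(input.toUpperCase());
    }
};
StageMetrics toUpperMetrics = new DefaultStageMetrics(metrics, "to-upper");
TrackedTransform<String, String> tracked =
    new TrackedTransform<>(toUpper, toUpperMetrics, taskContext.getDataTracer("to-upper"));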
use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.
the class ETLWorker method initializeTransforms.
private void initializeTransforms(WorkerContext context, Map<String, TransformDetail> transformDetailMap, PipelinePhase pipeline) throws Exception {
Set<StageInfo> transformInfos = pipeline.getStagesOfType(Transform.PLUGIN_TYPE);
Preconditions.checkArgument(transformInfos != null);
tranformIdToDatasetName = new HashMap<>(transformInfos.size());
for (StageInfo transformInfo : transformInfos) {
String transformName = transformInfo.getName();
try {
Transform<?, ?> transform = context.newPluginInstance(transformName);
transform = new WrappedTransform<>(transform, Caller.DEFAULT);
WorkerRealtimeContext transformContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), transformInfo);
LOG.debug("Transform Class : {}", transform.getClass().getName());
transform.initialize(transformContext);
StageMetrics stageMetrics = new DefaultStageMetrics(metrics, transformName);
transformDetailMap.put(transformName, new TransformDetail(new TrackedTransform<>(transform, stageMetrics, context.getDataTracer(transformName)), pipeline.getStageOutputs(transformName)));
if (transformInfo.getErrorDatasetName() != null) {
tranformIdToDatasetName.put(transformName, transformInfo.getErrorDatasetName());
}
} catch (InstantiationException e) {
LOG.error("Unable to instantiate Transform", e);
Throwables.propagate(e);
}
}
}
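The tranformIdToDatasetName map populated here is what run() consults when a transform emits error records: only stages that declared an error dataset have their InvalidEntry objects persisted, otherwise a warning is logged once per stage. For reference, a hypothetical Transform that routes bad records to the error flow could look like the sketch below; the class name, error code and message are invented, and Emitter.emitError with an InvalidEntry is assumed to be the error-emission API.
// Hypothetical transform: empty records go to the error emitter and, when an
// error dataset is configured for this stage, end up in that dataset.
public class NonEmptyFilter extends Transform<String, String> {
    @Override
    public void transform(String input, Emitter<String> emitter) throws Exception {
        if (input == null || input.isEmpty()) {
            emitter.emitError(new InvalidEntry<>(31, "empty record", input));
        } else {
            emitter.emit(input);
        }
    }
}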
use of co.cask.cdap.etl.common.DefaultStageMetrics in project cdap by caskdata.
the class ETLWorker method initializeSinks.
@SuppressWarnings("unchecked")
private void initializeSinks(WorkerContext context, Map<String, TransformDetail> transformationMap, PipelinePhase pipeline) throws Exception {
Set<StageInfo> sinkInfos = pipeline.getStagesOfType(RealtimeSink.PLUGIN_TYPE);
sinks = new HashMap<>(sinkInfos.size());
for (StageInfo sinkInfo : sinkInfos) {
String sinkName = sinkInfo.getName();
RealtimeSink sink = context.newPluginInstance(sinkName);
sink = new LoggedRealtimeSink(sinkName, sink);
WorkerRealtimeContext sinkContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), sinkInfo);
LOG.debug("Sink Class : {}", sink.getClass().getName());
sink.initialize(sinkContext);
sink = new TrackedRealtimeSink(sink, new DefaultStageMetrics(metrics, sinkName));
Transformation identityTransformation = new Transformation() {
@Override
public void transform(Object input, Emitter emitter) throws Exception {
emitter.emit(input);
}
};
// we use an identity transformation to simplify executing transformations in the pipeline (similar to ETLMapReduce);
// since we want to emit metrics during the write to the sink and not during this transformation, no records-out metric is passed here.
TrackedTransform trackedTransform = new TrackedTransform(identityTransformation, new DefaultStageMetrics(metrics, sinkName), TrackedTransform.RECORDS_IN, null, context.getDataTracer(sinkName));
transformationMap.put(sinkInfo.getName(), new TransformDetail(trackedTransform, new HashSet<String>()));
sinks.put(sinkInfo.getName(), sink);
}
}
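Worth noting is how metric ownership is split on the sink side: the identity TrackedTransform above reports only records arriving at the sink stage (TrackedTransform.RECORDS_IN, with no records-out metric), while the actual writes are metered by the TrackedRealtimeSink wrapper. A condensed sketch of that pairing for a single hypothetical stage named "my-sink", reusing metrics, context, sink and identityTransformation from the method above.
StageMetrics sinkMetrics = new DefaultStageMetrics(metrics, "my-sink");
// Counts records handed to the sink stage; the null means no records-out metric.
TrackedTransform countIn = new TrackedTransform(
    identityTransformation, sinkMetrics, TrackedTransform.RECORDS_IN, null,
    context.getDataTracer("my-sink"));
// Counts what the wrapped sink actually writes.
RealtimeSink trackedSink = new TrackedRealtimeSink(sink, sinkMetrics);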