Search in sources :

Example 1 with SourceState

use of co.cask.cdap.etl.api.realtime.SourceState in project cdap by caskdata.

the class ETLWorker method run.

@Override
public void run() {
    final SourceState currentState = new SourceState();
    final SourceState nextState = new SourceState();
    final Map<String, List<Object>> dataToSink = new HashMap<>();
    boolean hasData = false;
    final Map<String, List<InvalidEntry>> transformIdToErrorRecords = intializeTransformIdToErrorsList();
    final WorkerContext context = getContext();
    Set<String> transformErrorsWithoutDataset = Sets.newHashSet();
    // Fetch SourceState from State Table.
    // Only required at the beginning since we persist the state if there is a change.
    Transactionals.execute(context, new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
            byte[] stateBytes = stateTable.read(stateStoreKeyBytes);
            if (stateBytes != null) {
                SourceState state = GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
                currentState.setState(state);
            }
        }
    });
    DefaultEmitter<Object> sourceEmitter = new DefaultEmitter<>();
    TrackedEmitter<Object> trackedSourceEmitter = new TrackedEmitter<>(sourceEmitter, new DefaultStageMetrics(metrics, sourceStageName), TrackedTransform.RECORDS_OUT, context.getDataTracer(sourceStageName));
    while (!stopped) {
        // Invoke poll method of the source to fetch data
        try {
            SourceState newState = source.poll(trackedSourceEmitter, new SourceState(currentState));
            if (newState != null) {
                nextState.setState(newState);
            }
        } catch (Exception e) {
            // Continue since the source threw an exception. No point in processing records and state is not changed.
            LOG.warn("Exception thrown during polling of Source for data", e);
            sourceEmitter.reset();
            continue;
        }
        // to be persisted in the sink.
        for (Object sourceData : sourceEmitter.getEntries()) {
            try {
                TransformResponse transformResponse = transformExecutor.runOneIteration(sourceData);
                for (Map.Entry<String, Collection<Object>> transformedValues : transformResponse.getSinksResults().entrySet()) {
                    dataToSink.put(transformedValues.getKey(), new ArrayList<>());
                    Iterator emitterIterator = transformedValues.getValue().iterator();
                    while (emitterIterator.hasNext()) {
                        if (!hasData) {
                            hasData = true;
                        }
                        dataToSink.get(transformedValues.getKey()).add(emitterIterator.next());
                    }
                }
                for (Map.Entry<String, Collection<InvalidEntry<Object>>> transformErrorsEntry : transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
                    if (!transformErrorsWithoutDataset.contains(transformErrorsEntry.getKey())) {
                        if (!tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            transformErrorsWithoutDataset.add(transformErrorsEntry.getKey());
                            LOG.warn("Error records were emitted in transform {}, " + "but error dataset is not configured for this transform", transformErrorsEntry.getKey());
                        }
                        if (tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            // add the errors
                            if (!hasData && transformErrorsEntry.getValue().size() > 0) {
                                hasData = true;
                            }
                            transformIdToErrorRecords.get(transformErrorsEntry.getKey()).addAll(transformErrorsEntry.getValue());
                        }
                    }
                }
            } catch (Exception e) {
                LOG.warn("Exception thrown while processing data {}", sourceData, e);
            }
        }
        sourceEmitter.reset();
        // Start a Transaction if there is data to persist or if the Source state has changed.
        try {
            if (hasData || (!nextState.equals(currentState))) {
                getContext().execute(new TxRunnable() {

                    @Override
                    public void run(DatasetContext context) throws Exception {
                        // Invoke the sink's write method if there is any object to be written.
                        if (!dataToSink.isEmpty()) {
                            DefaultDataWriter defaultDataWriter = new DefaultDataWriter(getContext(), context);
                            for (Map.Entry<String, List<Object>> sinkEntry : dataToSink.entrySet()) {
                                sinks.get(sinkEntry.getKey()).write(sinkEntry.getValue(), defaultDataWriter);
                            }
                        }
                        for (Map.Entry<String, List<InvalidEntry>> errorRecordEntry : transformIdToErrorRecords.entrySet()) {
                            String transformId = errorRecordEntry.getKey();
                            final String datasetName = tranformIdToDatasetName.get(transformId);
                            Table errorTable = context.getDataset(datasetName);
                            long timeInMillis = System.currentTimeMillis();
                            byte[] currentTime = Bytes.toBytes(timeInMillis);
                            String transformIdentifier = appName + SEPARATOR + transformId;
                            for (InvalidEntry invalidEntry : errorRecordEntry.getValue()) {
                                // using random uuid as we want to write each record uniquely,
                                // but we are not concerned about the uuid while scanning later.
                                byte[] rowKey = Bytes.concat(currentTime, Bytes.toBytes(transformIdentifier), Bytes.toBytes(UUID.randomUUID()));
                                Put errorPut = constructErrorPut(rowKey, invalidEntry, timeInMillis);
                                errorTable.write(rowKey, errorPut);
                            }
                        }
                        // Persist nextState if it is different from currentState
                        if (!nextState.equals(currentState)) {
                            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
                            stateTable.write(stateStoreKey, GSON.toJson(nextState));
                        }
                        // after running one iteration and succesfully writing to sinks and error datasets, reset the emitters.
                        transformExecutor.resetEmitter();
                    }
                });
                // Update the in-memory copy of the state only if the transaction succeeded.
                currentState.setState(nextState);
            }
        } catch (Exception e) {
            LOG.warn("Exception thrown during persisting of data", e);
        } finally {
            // Clear the persisted sink data (in case transaction failure occurred, we will poll the source with old state)
            hasData = false;
            dataToSink.clear();
            for (List<InvalidEntry> invalidEntryList : transformIdToErrorRecords.values()) {
                invalidEntryList.clear();
            }
        }
    }
}
Also used : DefaultEmitter(co.cask.cdap.etl.common.DefaultEmitter) HashMap(java.util.HashMap) InvalidEntry(co.cask.cdap.etl.api.InvalidEntry) TxRunnable(co.cask.cdap.api.TxRunnable) TrackedEmitter(co.cask.cdap.etl.common.TrackedEmitter) CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) Iterator(java.util.Iterator) List(java.util.List) ArrayList(java.util.ArrayList) DatasetContext(co.cask.cdap.api.data.DatasetContext) InvalidEntry(co.cask.cdap.etl.api.InvalidEntry) SourceState(co.cask.cdap.etl.api.realtime.SourceState) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) IOException(java.io.IOException) Put(co.cask.cdap.api.dataset.table.Put) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Collection(java.util.Collection) TransformResponse(co.cask.cdap.etl.common.TransformResponse) WorkerContext(co.cask.cdap.api.worker.WorkerContext) Map(java.util.Map) HashMap(java.util.HashMap) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)

Aggregations

TxRunnable (co.cask.cdap.api.TxRunnable)1 DatasetContext (co.cask.cdap.api.data.DatasetContext)1 CloseableIterator (co.cask.cdap.api.dataset.lib.CloseableIterator)1 KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable)1 Put (co.cask.cdap.api.dataset.table.Put)1 Table (co.cask.cdap.api.dataset.table.Table)1 WorkerContext (co.cask.cdap.api.worker.WorkerContext)1 InvalidEntry (co.cask.cdap.etl.api.InvalidEntry)1 SourceState (co.cask.cdap.etl.api.realtime.SourceState)1 DefaultEmitter (co.cask.cdap.etl.common.DefaultEmitter)1 DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics)1 TrackedEmitter (co.cask.cdap.etl.common.TrackedEmitter)1 TransformResponse (co.cask.cdap.etl.common.TransformResponse)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 HashMap (java.util.HashMap)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Map (java.util.Map)1