
Example 16 with DatasetContext

use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.

the class ETLWorker method run.

@Override
public void run() {
    final SourceState currentState = new SourceState();
    final SourceState nextState = new SourceState();
    final Map<String, List<Object>> dataToSink = new HashMap<>();
    boolean hasData = false;
    final Map<String, List<InvalidEntry>> transformIdToErrorRecords = intializeTransformIdToErrorsList();
    final WorkerContext context = getContext();
    Set<String> transformErrorsWithoutDataset = Sets.newHashSet();
    // Fetch SourceState from State Table.
    // Only required at the beginning since we persist the state if there is a change.
    Transactionals.execute(context, new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
            byte[] stateBytes = stateTable.read(stateStoreKeyBytes);
            if (stateBytes != null) {
                SourceState state = GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
                currentState.setState(state);
            }
        }
    });
    DefaultEmitter<Object> sourceEmitter = new DefaultEmitter<>();
    TrackedEmitter<Object> trackedSourceEmitter = new TrackedEmitter<>(sourceEmitter, new DefaultStageMetrics(metrics, sourceStageName), TrackedTransform.RECORDS_OUT, context.getDataTracer(sourceStageName));
    while (!stopped) {
        // Invoke poll method of the source to fetch data
        try {
            SourceState newState = source.poll(trackedSourceEmitter, new SourceState(currentState));
            if (newState != null) {
                nextState.setState(newState);
            }
        } catch (Exception e) {
            // The source threw an exception; there is no point in processing records, and the state is unchanged.
            LOG.warn("Exception thrown during polling of Source for data", e);
            sourceEmitter.reset();
            continue;
        }
        // For each record emitted by the source, run one iteration of the transform executor and collect the transformed data to be persisted in the sinks.
        for (Object sourceData : sourceEmitter.getEntries()) {
            try {
                TransformResponse transformResponse = transformExecutor.runOneIteration(sourceData);
                for (Map.Entry<String, Collection<Object>> transformedValues : transformResponse.getSinksResults().entrySet()) {
                    dataToSink.put(transformedValues.getKey(), new ArrayList<>());
                    Iterator<Object> emitterIterator = transformedValues.getValue().iterator();
                    while (emitterIterator.hasNext()) {
                        if (!hasData) {
                            hasData = true;
                        }
                        dataToSink.get(transformedValues.getKey()).add(emitterIterator.next());
                    }
                }
                for (Map.Entry<String, Collection<InvalidEntry<Object>>> transformErrorsEntry : transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
                    if (!transformErrorsWithoutDataset.contains(transformErrorsEntry.getKey())) {
                        if (!tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            transformErrorsWithoutDataset.add(transformErrorsEntry.getKey());
                            LOG.warn("Error records were emitted in transform {}, " + "but error dataset is not configured for this transform", transformErrorsEntry.getKey());
                        }
                        if (tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
                            // add the errors
                            if (!hasData && transformErrorsEntry.getValue().size() > 0) {
                                hasData = true;
                            }
                            transformIdToErrorRecords.get(transformErrorsEntry.getKey()).addAll(transformErrorsEntry.getValue());
                        }
                    }
                }
            } catch (Exception e) {
                LOG.warn("Exception thrown while processing data {}", sourceData, e);
            }
        }
        sourceEmitter.reset();
        // Start a Transaction if there is data to persist or if the Source state has changed.
        try {
            if (hasData || (!nextState.equals(currentState))) {
                getContext().execute(new TxRunnable() {

                    @Override
                    public void run(DatasetContext context) throws Exception {
                        // Invoke the sink's write method if there is any object to be written.
                        if (!dataToSink.isEmpty()) {
                            DefaultDataWriter defaultDataWriter = new DefaultDataWriter(getContext(), context);
                            for (Map.Entry<String, List<Object>> sinkEntry : dataToSink.entrySet()) {
                                sinks.get(sinkEntry.getKey()).write(sinkEntry.getValue(), defaultDataWriter);
                            }
                        }
                        for (Map.Entry<String, List<InvalidEntry>> errorRecordEntry : transformIdToErrorRecords.entrySet()) {
                            String transformId = errorRecordEntry.getKey();
                            final String datasetName = tranformIdToDatasetName.get(transformId);
                            Table errorTable = context.getDataset(datasetName);
                            long timeInMillis = System.currentTimeMillis();
                            byte[] currentTime = Bytes.toBytes(timeInMillis);
                            String transformIdentifier = appName + SEPARATOR + transformId;
                            for (InvalidEntry invalidEntry : errorRecordEntry.getValue()) {
                                // using random uuid as we want to write each record uniquely,
                                // but we are not concerned about the uuid while scanning later.
                                byte[] rowKey = Bytes.concat(currentTime, Bytes.toBytes(transformIdentifier), Bytes.toBytes(UUID.randomUUID()));
                                Put errorPut = constructErrorPut(rowKey, invalidEntry, timeInMillis);
                                errorTable.write(rowKey, errorPut);
                            }
                        }
                        // Persist nextState if it is different from currentState
                        if (!nextState.equals(currentState)) {
                            KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
                            stateTable.write(stateStoreKey, GSON.toJson(nextState));
                        }
                        // after running one iteration and successfully writing to sinks and error datasets, reset the emitters.
                        transformExecutor.resetEmitter();
                    }
                });
                // Update the in-memory copy of the state only if the transaction succeeded.
                currentState.setState(nextState);
            }
        } catch (Exception e) {
            LOG.warn("Exception thrown during persisting of data", e);
        } finally {
            // Clear the buffered sink data and error records; if the transaction failed, the source will be polled again with the old state.
            hasData = false;
            dataToSink.clear();
            for (List<InvalidEntry> invalidEntryList : transformIdToErrorRecords.values()) {
                invalidEntryList.clear();
            }
        }
    }
}
Also used : DefaultEmitter(co.cask.cdap.etl.common.DefaultEmitter) HashMap(java.util.HashMap) InvalidEntry(co.cask.cdap.etl.api.InvalidEntry) TxRunnable(co.cask.cdap.api.TxRunnable) TrackedEmitter(co.cask.cdap.etl.common.TrackedEmitter) CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) Iterator(java.util.Iterator) List(java.util.List) ArrayList(java.util.ArrayList) DatasetContext(co.cask.cdap.api.data.DatasetContext) InvalidEntry(co.cask.cdap.etl.api.InvalidEntry) SourceState(co.cask.cdap.etl.api.realtime.SourceState) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Table(co.cask.cdap.api.dataset.table.Table) IOException(java.io.IOException) Put(co.cask.cdap.api.dataset.table.Put) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) Collection(java.util.Collection) TransformResponse(co.cask.cdap.etl.common.TransformResponse) WorkerContext(co.cask.cdap.api.worker.WorkerContext) Map(java.util.Map) HashMap(java.util.HashMap) DefaultStageMetrics(co.cask.cdap.etl.common.DefaultStageMetrics)
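
The run() method above mixes several concerns; the core DatasetContext pattern it relies on is a short transaction that reads and conditionally rewrites a single KeyValueTable entry. A minimal sketch of that pattern follows; the dataset name "sketch.state" and the key handling are hypothetical stand-ins for ETLRealtimeApplication.STATE_TABLE and the per-instance stateStoreKey used above.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

public final class StateTableSketch {

    // Hypothetical dataset name; the example above uses ETLRealtimeApplication.STATE_TABLE.
    private static final String STATE_TABLE = "sketch.state";

    /**
     * Persists the given state JSON under the key, but only if it differs from what is stored.
     * The Transactional is typically the program context itself (e.g. a WorkerContext).
     */
    public static void persistIfChanged(Transactional transactional, final String key, final String newStateJson) {
        Transactionals.execute(transactional, new TxRunnable() {
            @Override
            public void run(DatasetContext context) throws Exception {
                KeyValueTable stateTable = context.getDataset(STATE_TABLE);
                byte[] previous = stateTable.read(Bytes.toBytes(key));
                String previousJson = previous == null ? null : Bytes.toString(previous);
                // write only when the state actually changed, mirroring the worker's behaviour
                if (!newStateJson.equals(previousJson)) {
                    stateTable.write(key, newStateJson);
                }
            }
        });
    }

    private StateTableSketch() {
    }
}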

Example 17 with DatasetContext

use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.

the class ArtifactStore method getArtifact.

/**
   * Get information about the given artifact.
   *
   * @param artifactId the artifact to get
   * @return information about the artifact
   * @throws ArtifactNotFoundException if the given artifact does not exist
   * @throws IOException if there was an exception reading the artifact information from the metastore
   */
public ArtifactDetail getArtifact(final Id.Artifact artifactId) throws ArtifactNotFoundException, IOException {
    try {
        final ArtifactData artifactData = Transactions.execute(transactional, new TxCallable<ArtifactData>() {

            @Override
            public ArtifactData call(DatasetContext context) throws Exception {
                ArtifactCell artifactCell = new ArtifactCell(artifactId);
                byte[] value = getMetaTable(context).get(artifactCell.rowkey, artifactCell.column);
                if (value == null) {
                    throw new ArtifactNotFoundException(artifactId.toEntityId());
                }
                return GSON.fromJson(Bytes.toString(value), ArtifactData.class);
            }
        });
        Location artifactLocation = impersonator.doAs(artifactId.getNamespace().toEntityId(), new Callable<Location>() {

            @Override
            public Location call() throws Exception {
                return Locations.getLocationFromAbsolutePath(locationFactory, artifactData.getLocationPath());
            }
        });
        return new ArtifactDetail(new ArtifactDescriptor(artifactId.toArtifactId(), artifactLocation), artifactData.meta);
    } catch (TransactionFailureException e) {
        throw Transactions.propagate(e, IOException.class, ArtifactNotFoundException.class);
    } catch (Exception e) {
        throw Throwables.propagate(e);
    }
}
Also used : IOException(java.io.IOException) TransactionFailureException(org.apache.tephra.TransactionFailureException) ArtifactNotFoundException(co.cask.cdap.common.ArtifactNotFoundException) ArtifactAlreadyExistsException(co.cask.cdap.common.ArtifactAlreadyExistsException) TransactionConflictException(org.apache.tephra.TransactionConflictException) PluginNotExistsException(co.cask.cdap.internal.app.runtime.plugin.PluginNotExistsException) ArtifactRangeNotFoundException(co.cask.cdap.common.ArtifactRangeNotFoundException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException) IOException(java.io.IOException) TransactionFailureException(org.apache.tephra.TransactionFailureException) DatasetContext(co.cask.cdap.api.data.DatasetContext) ArtifactNotFoundException(co.cask.cdap.common.ArtifactNotFoundException) Location(org.apache.twill.filesystem.Location)
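
The same Transactions.execute / TxCallable combination can be reused whenever a value must be read inside a transaction and returned to the caller. Below is a minimal sketch of that shape against a hypothetical Table named "meta"; the Transactions and TxCallable helpers are the same internal CDAP classes used above, and their package names here are assumptions.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.table.Table;
// internal CDAP transaction helpers used by the example above; packages assumed
import co.cask.cdap.data2.transaction.Transactions;
import co.cask.cdap.data2.transaction.TxCallable;
import org.apache.tephra.TransactionFailureException;

import java.io.IOException;

public final class MetaLookupSketch {

    /**
     * Reads a single cell from the hypothetical "meta" Table inside a transaction and
     * returns it as a String, or null if the cell is absent.
     */
    public static String readCell(Transactional transactional, final String row, final String column)
            throws IOException {
        try {
            return Transactions.execute(transactional, new TxCallable<String>() {
                @Override
                public String call(DatasetContext context) throws Exception {
                    Table meta = context.getDataset("meta");
                    byte[] value = meta.get(Bytes.toBytes(row), Bytes.toBytes(column));
                    return value == null ? null : Bytes.toString(value);
                }
            });
        } catch (TransactionFailureException e) {
            // surface transaction failures as IOException instead of using Transactions.propagate
            throw new IOException("Failed to read " + row + ":" + column, e);
        }
    }

    private MetaLookupSketch() {
    }
}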

Example 18 with DatasetContext

use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.

the class MapReduceRuntimeService method destroy.

/**
   * Calls the destroy method of {@link ProgramLifecycle}.
   */
private void destroy(final boolean succeeded, final String failureInfo) throws Exception {
    // if any exception happens during output committing, we want the MapReduce to fail.
    // for that to happen it is not sufficient to set the status to failed, we have to throw an exception,
    // otherwise the shutdown completes successfully and the completed() callback is called.
    // thus: remember the exception and throw it at the end.
    final AtomicReference<Exception> failureCause = new AtomicReference<>();
    // TODO (CDAP-1952): this should be done in the output committer, to make the M/R fail if addPartition fails
    try {
        context.execute(new TxRunnable() {

            @Override
            public void run(DatasetContext ctxt) throws Exception {
                ClassLoader oldClassLoader = ClassLoaders.setContextClassLoader(job.getConfiguration().getClassLoader());
                try {
                    for (Map.Entry<String, ProvidedOutput> output : context.getOutputs().entrySet()) {
                        commitOutput(succeeded, output.getKey(), output.getValue().getOutputFormatProvider(), failureCause);
                        if (succeeded && failureCause.get() != null) {
                            // mapreduce was successful but this output committer failed: call onFailure() for all committers
                            for (ProvidedOutput toFail : context.getOutputs().values()) {
                                commitOutput(false, toFail.getAlias(), toFail.getOutputFormatProvider(), failureCause);
                            }
                            break;
                        }
                    }
                    // if there was a failure, we must throw an exception to fail the transaction
                    // this will roll back all the outputs and also make sure that postCommit() is not called
                    // throwing the failure cause: it will be wrapped in a TxFailure and handled in the outer catch()
                    Exception cause = failureCause.get();
                    if (cause != null) {
                        failureCause.set(null);
                        throw cause;
                    }
                } finally {
                    ClassLoaders.setContextClassLoader(oldClassLoader);
                }
            }
        });
    } catch (TransactionFailureException e) {
        LOG.error("Transaction failure when committing dataset outputs", e);
        if (failureCause.get() != null) {
            failureCause.get().addSuppressed(e);
        } else {
            failureCause.set(e);
        }
    }
    final boolean success = succeeded && failureCause.get() == null;
    context.setState(getProgramState(success, failureInfo));
    final TransactionControl txControl = mapReduce instanceof ProgramLifecycle ? Transactions.getTransactionControl(TransactionControl.IMPLICIT, MapReduce.class, mapReduce, "destroy") : TransactionControl.IMPLICIT;
    try {
        if (TransactionControl.IMPLICIT == txControl) {
            context.execute(new TxRunnable() {

                @Override
                public void run(DatasetContext context) throws Exception {
                    doDestroy(success);
                }
            });
        } else {
            doDestroy(success);
        }
    } catch (Throwable e) {
        if (e instanceof TransactionFailureException && e.getCause() != null && !(e instanceof TransactionConflictException)) {
            e = e.getCause();
        }
        LOG.warn("Error executing the destroy method of the MapReduce program {}", context.getProgram().getName(), e);
    }
    // this is needed to make the run fail if there was an exception. See comment at beginning of this method
    if (failureCause.get() != null) {
        throw failureCause.get();
    }
}
Also used : ProgramLifecycle(co.cask.cdap.api.ProgramLifecycle) TransactionConflictException(org.apache.tephra.TransactionConflictException) AtomicReference(java.util.concurrent.atomic.AtomicReference) ProvidedOutput(co.cask.cdap.internal.app.runtime.batch.dataset.output.ProvidedOutput) ProvisionException(com.google.inject.ProvisionException) IOException(java.io.IOException) TransactionFailureException(org.apache.tephra.TransactionFailureException) URISyntaxException(java.net.URISyntaxException) TransactionConflictException(org.apache.tephra.TransactionConflictException) AbstractMapReduce(co.cask.cdap.api.mapreduce.AbstractMapReduce) MapReduce(co.cask.cdap.api.mapreduce.MapReduce) JarEntry(java.util.jar.JarEntry) TransactionFailureException(org.apache.tephra.TransactionFailureException) TxRunnable(co.cask.cdap.api.TxRunnable) TransactionControl(co.cask.cdap.api.annotation.TransactionControl) WeakReferenceDelegatorClassLoader(co.cask.cdap.common.lang.WeakReferenceDelegatorClassLoader) CombineClassLoader(co.cask.cdap.common.lang.CombineClassLoader) DatasetContext(co.cask.cdap.api.data.DatasetContext)
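
The key DatasetContext technique in destroy() is forcing the transaction to fail by throwing from inside the TxRunnable, which rolls back every dataset operation made in that transaction. A minimal sketch of that pattern, assuming a hypothetical "audit" KeyValueTable and a caller-supplied validity flag:

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import org.apache.tephra.TransactionFailureException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public final class RollbackSketch {

    private static final Logger LOG = LoggerFactory.getLogger(RollbackSketch.class);

    /**
     * Writes to the hypothetical "audit" table, then aborts the transaction if validation fails,
     * so the write above is rolled back instead of being committed.
     */
    public static void writeOrRollBack(Transactional transactional, final String key,
                                       final String value, final boolean valid) {
        try {
            transactional.execute(new TxRunnable() {
                @Override
                public void run(DatasetContext context) throws Exception {
                    KeyValueTable audit = context.getDataset("audit");
                    audit.write(key, value);
                    if (!valid) {
                        // throwing here aborts the transaction and rolls back the write above
                        throw new IllegalStateException("validation failed for " + key);
                    }
                }
            });
        } catch (TransactionFailureException e) {
            // the original IllegalStateException is available as e.getCause()
            LOG.warn("Transaction for key {} was rolled back", key, e);
        }
    }

    private RollbackSketch() {
    }
}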

Example 19 with DatasetContext

use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.

the class ETLWorker method initialize.

@Override
public void initialize(final WorkerContext context) throws Exception {
    if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    super.initialize(context);
    Map<String, String> properties = context.getSpecification().getProperties();
    appName = context.getApplicationSpecification().getName();
    Preconditions.checkArgument(properties.containsKey(Constants.PIPELINEID));
    Preconditions.checkArgument(properties.containsKey(UNIQUE_ID));
    String uniqueId = properties.get(UNIQUE_ID);
    // Each worker instance should have its own unique state.
    final String appName = context.getApplicationSpecification().getName();
    stateStoreKey = String.format("%s%s%s%s%s", appName, SEPARATOR, uniqueId, SEPARATOR, context.getInstanceId());
    stateStoreKeyBytes = Bytes.toBytes(stateStoreKey);
    Transactionals.execute(getContext(), new TxRunnable() {

        @Override
        public void run(DatasetContext dsContext) throws Exception {
            KeyValueTable stateTable = dsContext.getDataset(ETLRealtimeApplication.STATE_TABLE);
            byte[] startKey = Bytes.toBytes(String.format("%s%s", appName, SEPARATOR));
            // Scan the table for keys with this application's name prefix and delete rows that don't match this worker instance's state key.
            try (CloseableIterator<KeyValue<byte[], byte[]>> rows = stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
                while (rows.hasNext()) {
                    KeyValue<byte[], byte[]> row = rows.next();
                    if (Bytes.compareTo(stateStoreKeyBytes, row.getKey()) != 0) {
                        stateTable.delete(row.getKey());
                    }
                }
            }
        }
    }, Exception.class);
    PipelinePhase pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), PipelinePhase.class);
    Map<String, TransformDetail> transformationMap = new HashMap<>();
    initializeSource(context, pipeline);
    initializeTransforms(context, transformationMap, pipeline);
    initializeSinks(context, transformationMap, pipeline);
    Set<String> startStages = new HashSet<>();
    startStages.addAll(pipeline.getStageOutputs(sourceStageName));
    transformExecutor = new TransformExecutor(transformationMap, startStages);
}
Also used : CloseableIterator(co.cask.cdap.api.dataset.lib.CloseableIterator) KeyValue(co.cask.cdap.api.dataset.lib.KeyValue) HashMap(java.util.HashMap) IOException(java.io.IOException) TxRunnable(co.cask.cdap.api.TxRunnable) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) TransformDetail(co.cask.cdap.etl.common.TransformDetail) TransformExecutor(co.cask.cdap.etl.common.TransformExecutor) DatasetContext(co.cask.cdap.api.data.DatasetContext) HashSet(java.util.HashSet)
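
The prefix scan in initialize() (stateTable.scan combined with Bytes.stopKeyForPrefix) is a generally useful DatasetContext idiom. The following sketch uses the same calls to collect all keys under a prefix rather than deleting them; the dataset name "sketch.state" is hypothetical.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.CloseableIterator;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

import java.util.ArrayList;
import java.util.List;

public final class PrefixScanSketch {

    /**
     * Returns all keys in the hypothetical "sketch.state" KeyValueTable that start with the
     * given prefix, scanning inside a single short transaction.
     */
    public static List<String> keysWithPrefix(Transactional transactional, final String prefix) {
        final List<String> keys = new ArrayList<>();
        Transactionals.execute(transactional, new TxRunnable() {
            @Override
            public void run(DatasetContext context) throws Exception {
                KeyValueTable stateTable = context.getDataset("sketch.state");
                byte[] startKey = Bytes.toBytes(prefix);
                // stopKeyForPrefix yields the smallest key greater than every key with this prefix
                try (CloseableIterator<KeyValue<byte[], byte[]>> rows =
                         stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
                    while (rows.hasNext()) {
                        keys.add(Bytes.toString(rows.next().getKey()));
                    }
                }
            }
        });
        return keys;
    }

    private PrefixScanSketch() {
    }
}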

Example 20 with DatasetContext

use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.

the class DefaultNamespaceStore method update.

@Override
public void update(final NamespaceMeta metadata) {
    Preconditions.checkArgument(metadata != null, "Namespace metadata cannot be null.");
    Transactions.executeUnchecked(transactional, new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            NamespaceMDS mds = getNamespaceMDS(context);
            NamespaceMeta existing = mds.get(metadata.getNamespaceId());
            if (existing != null) {
                mds.create(metadata);
            }
        }
    });
}
Also used : TxRunnable(co.cask.cdap.api.TxRunnable) NamespaceMeta(co.cask.cdap.proto.NamespaceMeta) DatasetContext(co.cask.cdap.api.data.DatasetContext) IOException(java.io.IOException) DatasetManagementException(co.cask.cdap.api.dataset.DatasetManagementException)
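
Transactions.executeUnchecked runs the TxRunnable in a transaction and rethrows any TransactionFailureException as an unchecked exception, which keeps methods like update() free of checked throws clauses. A minimal sketch of the same pattern against a hypothetical "counters" KeyValueTable; Transactions is the same internal helper class used above, and its package here is assumed.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
// internal CDAP helper used by the example above; package assumed
import co.cask.cdap.data2.transaction.Transactions;

public final class UncheckedUpdateSketch {

    /**
     * Overwrites the value stored under the given key in the hypothetical "counters" table.
     * Transaction failures surface as unchecked exceptions, so callers need no throws clause.
     */
    public static void overwrite(Transactional transactional, final String key, final long value) {
        Transactions.executeUnchecked(transactional, new TxRunnable() {
            @Override
            public void run(DatasetContext context) throws Exception {
                KeyValueTable counters = context.getDataset("counters");
                counters.write(Bytes.toBytes(key), Bytes.toBytes(value));
            }
        });
    }

    private UncheckedUpdateSketch() {
    }
}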

Aggregations

DatasetContext (co.cask.cdap.api.data.DatasetContext): 43
TxRunnable (co.cask.cdap.api.TxRunnable): 37
IOException (java.io.IOException): 22
TransactionFailureException (org.apache.tephra.TransactionFailureException): 22
DatasetManagementException (co.cask.cdap.api.dataset.DatasetManagementException): 15
TransactionConflictException (org.apache.tephra.TransactionConflictException): 15
ApplicationNotFoundException (co.cask.cdap.common.ApplicationNotFoundException): 10
ProgramNotFoundException (co.cask.cdap.common.ProgramNotFoundException): 10
NoSuchElementException (java.util.NoSuchElementException): 10
TransactionNotInProgressException (org.apache.tephra.TransactionNotInProgressException): 10
TransactionControl (co.cask.cdap.api.annotation.TransactionControl): 8
Table (co.cask.cdap.api.dataset.table.Table): 8
ProgramLifecycle (co.cask.cdap.api.ProgramLifecycle): 4
ApplicationSpecification (co.cask.cdap.api.app.ApplicationSpecification): 4
KeyValueTable (co.cask.cdap.api.dataset.lib.KeyValueTable): 4
ForwardingApplicationSpecification (co.cask.cdap.internal.app.ForwardingApplicationSpecification): 4
HashMap (java.util.HashMap): 4
AtomicReference (java.util.concurrent.atomic.AtomicReference): 4
Put (co.cask.cdap.api.dataset.table.Put): 3
Map (java.util.Map): 3