Use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.
From the class ETLWorker, method run().
@Override
public void run() {
  final SourceState currentState = new SourceState();
  final SourceState nextState = new SourceState();
  final Map<String, List<Object>> dataToSink = new HashMap<>();
  boolean hasData = false;
  final Map<String, List<InvalidEntry>> transformIdToErrorRecords = intializeTransformIdToErrorsList();
  final WorkerContext context = getContext();
  Set<String> transformErrorsWithoutDataset = Sets.newHashSet();
  // Fetch SourceState from the State Table.
  // Only required at the beginning since we persist the state whenever it changes.
  Transactionals.execute(context, new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
      byte[] stateBytes = stateTable.read(stateStoreKeyBytes);
      if (stateBytes != null) {
        SourceState state = GSON.fromJson(Bytes.toString(stateBytes), SourceState.class);
        currentState.setState(state);
      }
    }
  });
  DefaultEmitter<Object> sourceEmitter = new DefaultEmitter<>();
  TrackedEmitter<Object> trackedSourceEmitter =
    new TrackedEmitter<>(sourceEmitter, new DefaultStageMetrics(metrics, sourceStageName),
                         TrackedTransform.RECORDS_OUT, context.getDataTracer(sourceStageName));
  while (!stopped) {
    // Invoke the poll method of the source to fetch data.
    try {
      SourceState newState = source.poll(trackedSourceEmitter, new SourceState(currentState));
      if (newState != null) {
        nextState.setState(newState);
      }
    } catch (Exception e) {
      // Continue since the source threw an exception. There is no point in processing records, and the state has not changed.
      LOG.warn("Exception thrown during polling of Source for data", e);
      sourceEmitter.reset();
      continue;
    }
    // Run each emitted record through the transform executor and collect the data to be persisted in the sink.
    for (Object sourceData : sourceEmitter.getEntries()) {
      try {
        TransformResponse transformResponse = transformExecutor.runOneIteration(sourceData);
        for (Map.Entry<String, Collection<Object>> transformedValues : transformResponse.getSinksResults().entrySet()) {
          dataToSink.put(transformedValues.getKey(), new ArrayList<>());
          Iterator emitterIterator = transformedValues.getValue().iterator();
          while (emitterIterator.hasNext()) {
            if (!hasData) {
              hasData = true;
            }
            dataToSink.get(transformedValues.getKey()).add(emitterIterator.next());
          }
        }
        for (Map.Entry<String, Collection<InvalidEntry<Object>>> transformErrorsEntry : transformResponse.getMapTransformIdToErrorEmitter().entrySet()) {
          if (!transformErrorsWithoutDataset.contains(transformErrorsEntry.getKey())) {
            if (!tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
              transformErrorsWithoutDataset.add(transformErrorsEntry.getKey());
              LOG.warn("Error records were emitted in transform {}, " + "but error dataset is not configured for this transform", transformErrorsEntry.getKey());
            }
            if (tranformIdToDatasetName.containsKey(transformErrorsEntry.getKey()) && !transformErrorsEntry.getValue().isEmpty()) {
              // Add the errors.
              if (!hasData && transformErrorsEntry.getValue().size() > 0) {
                hasData = true;
              }
              transformIdToErrorRecords.get(transformErrorsEntry.getKey()).addAll(transformErrorsEntry.getValue());
            }
          }
        }
      } catch (Exception e) {
        LOG.warn("Exception thrown while processing data {}", sourceData, e);
      }
    }
    sourceEmitter.reset();
    // Start a transaction if there is data to persist or if the source state has changed.
    try {
      if (hasData || (!nextState.equals(currentState))) {
        getContext().execute(new TxRunnable() {
          @Override
          public void run(DatasetContext context) throws Exception {
            // Invoke the sink's write method if there is any object to be written.
            if (!dataToSink.isEmpty()) {
              DefaultDataWriter defaultDataWriter = new DefaultDataWriter(getContext(), context);
              for (Map.Entry<String, List<Object>> sinkEntry : dataToSink.entrySet()) {
                sinks.get(sinkEntry.getKey()).write(sinkEntry.getValue(), defaultDataWriter);
              }
            }
            for (Map.Entry<String, List<InvalidEntry>> errorRecordEntry : transformIdToErrorRecords.entrySet()) {
              String transformId = errorRecordEntry.getKey();
              final String datasetName = tranformIdToDatasetName.get(transformId);
              Table errorTable = context.getDataset(datasetName);
              long timeInMillis = System.currentTimeMillis();
              byte[] currentTime = Bytes.toBytes(timeInMillis);
              String transformIdentifier = appName + SEPARATOR + transformId;
              for (InvalidEntry invalidEntry : errorRecordEntry.getValue()) {
                // Use a random UUID so that each record is written under a unique row key;
                // the UUID itself is not relevant when scanning later.
                byte[] rowKey = Bytes.concat(currentTime, Bytes.toBytes(transformIdentifier), Bytes.toBytes(UUID.randomUUID()));
                Put errorPut = constructErrorPut(rowKey, invalidEntry, timeInMillis);
                errorTable.write(rowKey, errorPut);
              }
            }
            // Persist nextState if it is different from currentState.
            if (!nextState.equals(currentState)) {
              KeyValueTable stateTable = context.getDataset(ETLRealtimeApplication.STATE_TABLE);
              stateTable.write(stateStoreKey, GSON.toJson(nextState));
            }
            // After running one iteration and successfully writing to the sinks and error datasets, reset the emitters.
            transformExecutor.resetEmitter();
          }
        });
        // Update the in-memory copy of the state only if the transaction succeeded.
        currentState.setState(nextState);
      }
    } catch (Exception e) {
      LOG.warn("Exception thrown during persisting of data", e);
    } finally {
      // Clear the collected sink data; if the transaction failed, we will poll the source with the old state.
      hasData = false;
      dataToSink.clear();
      for (List<InvalidEntry> invalidEntryList : transformIdToErrorRecords.values()) {
        invalidEntryList.clear();
      }
    }
  }
}
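The snippet above boils down to one recurring pattern: obtain a DatasetContext only inside a transaction, look up the dataset by name, and keep all reads and writes within that TxRunnable. A minimal sketch of the pattern, assuming a worker application that declares a KeyValueTable; the StatefulWorker class, the "state.table" dataset name, and the state key are illustrative assumptions, not part of CDAP or of the snippet above.

import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import co.cask.cdap.api.worker.AbstractWorker;

public class StatefulWorker extends AbstractWorker {

  private static final String STATE_TABLE = "state.table";        // hypothetical dataset name
  private static final byte[] STATE_KEY = Bytes.toBytes("worker.state");

  @Override
  public void run() {
    // Read the previously persisted state inside a short transaction.
    final byte[][] stateHolder = new byte[1][];
    Transactionals.execute(getContext(), new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        KeyValueTable stateTable = context.getDataset(STATE_TABLE);
        stateHolder[0] = stateTable.read(STATE_KEY);
      }
    });

    // ... poll a source and transform records here ...

    // Persist the new state in a separate, later transaction.
    final byte[] newState = Bytes.toBytes(System.currentTimeMillis());
    Transactionals.execute(getContext(), new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        KeyValueTable stateTable = context.getDataset(STATE_TABLE);
        stateTable.write(STATE_KEY, newState);
      }
    });
  }
}

Keeping the read and the write in separate transactions mirrors the ETLWorker above, which reads the state once at startup and persists it only when it changes.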
Use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.
From the class ArtifactStore, method getArtifact().
/**
* Get information about the given artifact.
*
* @param artifactId the artifact to get
* @return information about the artifact
* @throws ArtifactNotFoundException if the given artifact does not exist
* @throws IOException if there was an exception reading the artifact information from the metastore
*/
public ArtifactDetail getArtifact(final Id.Artifact artifactId) throws ArtifactNotFoundException, IOException {
  try {
    final ArtifactData artifactData = Transactions.execute(transactional, new TxCallable<ArtifactData>() {
      @Override
      public ArtifactData call(DatasetContext context) throws Exception {
        ArtifactCell artifactCell = new ArtifactCell(artifactId);
        byte[] value = getMetaTable(context).get(artifactCell.rowkey, artifactCell.column);
        if (value == null) {
          throw new ArtifactNotFoundException(artifactId.toEntityId());
        }
        return GSON.fromJson(Bytes.toString(value), ArtifactData.class);
      }
    });
    Location artifactLocation = impersonator.doAs(artifactId.getNamespace().toEntityId(), new Callable<Location>() {
      @Override
      public Location call() throws Exception {
        return Locations.getLocationFromAbsolutePath(locationFactory, artifactData.getLocationPath());
      }
    });
    return new ArtifactDetail(new ArtifactDescriptor(artifactId.toArtifactId(), artifactLocation), artifactData.meta);
  } catch (TransactionFailureException e) {
    throw Transactions.propagate(e, IOException.class, ArtifactNotFoundException.class);
  } catch (Exception e) {
    throw Throwables.propagate(e);
  }
}
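getArtifact() shows the value-returning variant: the transactional work is wrapped in a TxCallable, and whatever call() returns is handed back to the caller once the transaction commits (here via an internal Transactions helper). A minimal sketch of the same idea using the public co.cask.cdap.api.Transactionals API; the MetaReader class, the "meta.table" dataset name, and the key format are assumptions for illustration.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxCallable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

public final class MetaReader {

  /**
   * Reads a single metadata entry inside a transaction and returns it as a String,
   * or null if the key is absent. The "meta.table" dataset name is illustrative.
   */
  static String readMeta(Transactional transactional, final String key) {
    return Transactionals.execute(transactional, new TxCallable<String>() {
      @Override
      public String call(DatasetContext context) throws Exception {
        KeyValueTable metaTable = context.getDataset("meta.table");
        byte[] value = metaTable.read(Bytes.toBytes(key));
        // The returned value is handed back to the caller after the transaction commits.
        return value == null ? null : Bytes.toString(value);
      }
    });
  }
}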
Use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.
From the class MapReduceRuntimeService, method destroy().
/**
* Calls the destroy method of {@link ProgramLifecycle}.
*/
private void destroy(final boolean succeeded, final String failureInfo) throws Exception {
  // If any exception happens during output committing, we want the MapReduce to fail.
  // For that to happen, it is not sufficient to set the status to failed; we have to throw an exception,
  // otherwise the shutdown completes successfully and the completed() callback is called.
  // Thus: remember the exception and throw it at the end.
  final AtomicReference<Exception> failureCause = new AtomicReference<>();
  // TODO (CDAP-1952): this should be done in the output committer, to make the M/R fail if addPartition fails
  try {
    context.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext ctxt) throws Exception {
        ClassLoader oldClassLoader = ClassLoaders.setContextClassLoader(job.getConfiguration().getClassLoader());
        try {
          for (Map.Entry<String, ProvidedOutput> output : context.getOutputs().entrySet()) {
            commitOutput(succeeded, output.getKey(), output.getValue().getOutputFormatProvider(), failureCause);
            if (succeeded && failureCause.get() != null) {
              // The MapReduce was successful but this output committer failed: call onFailure() for all committers.
              for (ProvidedOutput toFail : context.getOutputs().values()) {
                commitOutput(false, toFail.getAlias(), toFail.getOutputFormatProvider(), failureCause);
              }
              break;
            }
          }
          // If there was a failure, we must throw an exception to fail the transaction;
          // this will roll back all the outputs and also make sure that postCommit() is not called.
          // Throwing the failure cause: it will be wrapped in a TxFailure and handled in the outer catch().
          Exception cause = failureCause.get();
          if (cause != null) {
            failureCause.set(null);
            throw cause;
          }
        } finally {
          ClassLoaders.setContextClassLoader(oldClassLoader);
        }
      }
    });
  } catch (TransactionFailureException e) {
    LOG.error("Transaction failure when committing dataset outputs", e);
    if (failureCause.get() != null) {
      failureCause.get().addSuppressed(e);
    } else {
      failureCause.set(e);
    }
  }
  final boolean success = succeeded && failureCause.get() == null;
  context.setState(getProgramState(success, failureInfo));
  final TransactionControl txControl = mapReduce instanceof ProgramLifecycle
    ? Transactions.getTransactionControl(TransactionControl.IMPLICIT, MapReduce.class, mapReduce, "destroy")
    : TransactionControl.IMPLICIT;
  try {
    if (TransactionControl.IMPLICIT == txControl) {
      context.execute(new TxRunnable() {
        @Override
        public void run(DatasetContext context) throws Exception {
          doDestroy(success);
        }
      });
    } else {
      doDestroy(success);
    }
  } catch (Throwable e) {
    if (e instanceof TransactionFailureException && e.getCause() != null && !(e instanceof TransactionConflictException)) {
      e = e.getCause();
    }
    LOG.warn("Error executing the destroy method of the MapReduce program {}", context.getProgram().getName(), e);
  }
  // This is needed to make the run fail if there was an exception. See the comment at the beginning of this method.
  if (failureCause.get() != null) {
    throw failureCause.get();
  }
}
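destroy() relies on a detail worth calling out: throwing from inside a TxRunnable aborts the transaction, rolls back any dataset changes made in it, and surfaces as a TransactionFailureException from execute(), with the original exception as the cause. A minimal sketch of that behavior, assuming some Transactional (for example, a program context) and a hypothetical "audit" KeyValueTable.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;
import org.apache.tephra.TransactionFailureException;

public final class CommitOrRollback {

  /**
   * Writes a marker row, then throws if the commit flag is false. The exception
   * aborts the transaction, so the write above it is rolled back and execute()
   * throws a TransactionFailureException carrying the original exception as its cause.
   * The "audit" dataset name is illustrative.
   */
  static void writeMarker(Transactional transactional, final boolean commit) {
    try {
      transactional.execute(new TxRunnable() {
        @Override
        public void run(DatasetContext context) throws Exception {
          KeyValueTable audit = context.getDataset("audit");
          audit.write(Bytes.toBytes("marker"), Bytes.toBytes(System.currentTimeMillis()));
          if (!commit) {
            throw new IllegalStateException("aborting: the write above will be rolled back");
          }
        }
      });
    } catch (TransactionFailureException e) {
      // e.getCause() is the exception thrown inside the TxRunnable; nothing was persisted.
      throw new RuntimeException("Transaction aborted, marker row was rolled back", e);
    }
  }
}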
Use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.
From the class ETLWorker, method initialize().
@Override
public void initialize(final WorkerContext context) throws Exception {
  if (Boolean.valueOf(context.getSpecification().getProperty(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  super.initialize(context);
  Map<String, String> properties = context.getSpecification().getProperties();
  appName = context.getApplicationSpecification().getName();
  Preconditions.checkArgument(properties.containsKey(Constants.PIPELINEID));
  Preconditions.checkArgument(properties.containsKey(UNIQUE_ID));
  String uniqueId = properties.get(UNIQUE_ID);
  // Each worker instance should have its own unique state.
  final String appName = context.getApplicationSpecification().getName();
  stateStoreKey = String.format("%s%s%s%s%s", appName, SEPARATOR, uniqueId, SEPARATOR, context.getInstanceId());
  stateStoreKeyBytes = Bytes.toBytes(stateStoreKey);
  Transactionals.execute(getContext(), new TxRunnable() {
    @Override
    public void run(DatasetContext dsContext) throws Exception {
      KeyValueTable stateTable = dsContext.getDataset(ETLRealtimeApplication.STATE_TABLE);
      byte[] startKey = Bytes.toBytes(String.format("%s%s", appName, SEPARATOR));
      // Scan the table for rows with the appName prefix and remove those that don't match the unique id of this application.
      try (CloseableIterator<KeyValue<byte[], byte[]>> rows = stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
        while (rows.hasNext()) {
          KeyValue<byte[], byte[]> row = rows.next();
          if (Bytes.compareTo(stateStoreKeyBytes, row.getKey()) != 0) {
            stateTable.delete(row.getKey());
          }
        }
      }
    }
  }, Exception.class);
  PipelinePhase pipeline = GSON.fromJson(properties.get(Constants.PIPELINEID), PipelinePhase.class);
  Map<String, TransformDetail> transformationMap = new HashMap<>();
  initializeSource(context, pipeline);
  initializeTransforms(context, transformationMap, pipeline);
  initializeSinks(context, transformationMap, pipeline);
  Set<String> startStages = new HashSet<>();
  startStages.addAll(pipeline.getStageOutputs(sourceStageName));
  transformExecutor = new TransformExecutor(transformationMap, startStages);
}
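initialize() combines a prefix scan with deletes inside a single transaction, so stale state rows from earlier deployments are removed atomically. The same pattern, pulled out into a standalone helper, might look like the sketch below; the StateCleanup class and the "state.table" dataset name are assumptions.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.CloseableIterator;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

public final class StateCleanup {

  /**
   * Deletes every row under the given key prefix except the single row to keep.
   * Scan and deletes run in one transaction. The "state.table" name is illustrative.
   */
  static void deleteStaleRows(Transactional transactional, final String prefix, final byte[] keyToKeep) {
    Transactionals.execute(transactional, new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        KeyValueTable stateTable = context.getDataset("state.table");
        byte[] startKey = Bytes.toBytes(prefix);
        try (CloseableIterator<KeyValue<byte[], byte[]>> rows =
               stateTable.scan(startKey, Bytes.stopKeyForPrefix(startKey))) {
          while (rows.hasNext()) {
            KeyValue<byte[], byte[]> row = rows.next();
            if (Bytes.compareTo(keyToKeep, row.getKey()) != 0) {
              stateTable.delete(row.getKey());
            }
          }
        }
      }
    });
  }
}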
Use of co.cask.cdap.api.data.DatasetContext in project cdap by caskdata.
From the class DefaultNamespaceStore, method update().
@Override
public void update(final NamespaceMeta metadata) {
  Preconditions.checkArgument(metadata != null, "Namespace metadata cannot be null.");
  Transactions.executeUnchecked(transactional, new TxRunnable() {
    @Override
    public void run(DatasetContext context) throws Exception {
      NamespaceMDS mds = getNamespaceMDS(context);
      NamespaceMeta existing = mds.get(metadata.getNamespaceId());
      if (existing != null) {
        // create() performs the write that replaces the existing metadata for this namespace.
        mds.create(metadata);
      }
    }
  });
}
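update() is a read-check-write sequence: because the get and the write run in the same transaction, the existence check and the update are applied together. A minimal sketch of the same conditional-update pattern against a plain KeyValueTable, with the UpdateIfExists class and the "config" dataset name as illustrative assumptions.

import co.cask.cdap.api.Transactional;
import co.cask.cdap.api.Transactionals;
import co.cask.cdap.api.TxRunnable;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.DatasetContext;
import co.cask.cdap.api.dataset.lib.KeyValueTable;

public final class UpdateIfExists {

  /**
   * Overwrites the value for a key only if a value is already present.
   * Read and write happen in the same transaction. The "config" name is illustrative.
   */
  static void updateIfExists(Transactional transactional, final String key, final String newValue) {
    Transactionals.execute(transactional, new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        KeyValueTable table = context.getDataset("config");
        byte[] existing = table.read(Bytes.toBytes(key));
        if (existing != null) {
          table.write(Bytes.toBytes(key), Bytes.toBytes(newValue));
        }
      }
    });
  }
}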